Data Mining Algorithms: A C++ Implementation of the DBSCAN Algorithm
(Final exams are coming up soon, so this write-up is rough. Please bear with me.)
I. Concept
DBSCAN is a density-based clustering algorithm. The number of clusters is determined automatically by the algorithm. Points in low-density regions are treated as noise and ignored, so DBSCAN does not produce a complete clustering (not every point is assigned to a cluster).
II. Pseudocode
1. Label every point as a core point, a border point, or a noise point.
2. Discard the noise points.
3. Add an edge between every pair of core points that lie within Eps of each other.
4. Each group of connected core points forms a cluster.
5. Assign each border point to a cluster of one of its associated core points.
III. Important Data Structures
1. Define the neighborhood radius, the density threshold, and the number of points in the dataset.
#define Eps 3 // neighborhood radius
#define MinPts 3 // neighborhood density threshold
#define N 20 // the dataset contains N objects
2. Define an array that stores all the points.
double point[N][2]; // stores all data points
3. Define vectors that store the positions of the core points, border points, and noise points.
vector<int> kernel_point; // positions of the core points in point[][]
vector<int> border_point; // positions of the border points in point[][]
vector<int> noise_point; // positions of the noise points in point[][]
4. Define the vector that stores the final clusters.
vector<vector<int> > cluster; // the final clusters; each cluster holds positions in point[][]
IV. Source Code
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
using namespace std;
#define Eps 3     // neighborhood radius used for every distance test
#define MinPts 3  // minimum number of points within Eps (the point itself counts) for a core point
#define N 20      // number of points in the dataset

double point[N][2];  // all data points: N rows, two coordinates per row

// Each of the three vectors below holds row indices into point[][].
std::vector<int> kernel_point;  // core points
std::vector<int> border_point;  // border points
std::vector<int> noise_point;   // noise points

// One row per core point: the core point followed by the later core points
// within Eps of it. Rows may overlap until they are merged into clusters.
std::vector<std::vector<int> > mid;

// Final clusters; each row lists the indices in point[][] of one cluster.
std::vector<std::vector<int> > cluster;
// Initialize N coordinate points
Void init (int n ){
Srand (unsigned) time (NULL ));
For (int I = 0; I <n; I ++ ){
For (int j = 0; j <2; j ++ ){
Point [I] [j] = rand () % (N + 1 );
}
}
}
Int main (int argc, char ** argv ){
// Initialize the dataset
Int n = N;
Init (n );
// Mark all points as core points, boundary points, or noise points
// Mark the core point
For (int I = 0; I <N; I ++ ){
Int num = 0; // determines whether the value exceeds MinPts. If num> = MinPts after a loop, the core point is added.
For (int j = 0; j <N; j ++ ){
If (pow (point [I] [0]-point [j] [0], 2) + pow (point [I] [1]-point [j] [1], 2) <= pow (Eps, 2) {// itself is also
Num ++;
}
}
If (num> = MinPts ){
Kernel_point.push_back (I );
}
}
// Mark as a boundary point or a noise point
For (int I = 0; I <N; I ++ ){
// The boundary point or noise point cannot be the core point
Int flag = 0; // If flag = 0, the vertex is not the core vertex. If flag = 1, the vertex is the core vertex.
For (int j = 0; j <kernel_point.size (); j ++ ){
If (I = kernel_point [j]) {
Flag = 1;
Break;
}
}
If (flag = 0 ){
// Determine whether it is a boundary point or a noise point
Int flag2 = 0; // If flag = 0, the point is the boundary point. If flag = 1, the point is the noise point.
For (int j = 0; j <kernel_point.size (); j ++ ){
Int s = kernel_point [j]; // mark the position of the j-th core point in point [] [] for convenient calling
If (pow (point [I] [0]-point [s] [0], 2) + pow (point [I] [1]-point [s] [1], 2) <pow (Eps, 2 )){
Flag2 = 0;
Border_point.push_back (I );
Break;
}
Else {
Flag2 = 1;
Continue;
}
}
If (flag2 = 1 ){
// Add noise points
Noise_point.push_back (I );
Continue;
}
}
Else {
Continue;
}
}
// Place the core point within the Eps in a vector
For (int I = 0; I <kernel_point.size (); I ++ ){
Int x = kernel_point [I];
Vector <int> record; // create a record for each vertex and put it in mid.
Record. push_back (x );
For (int j = I + 1; j <kernel_point.size (); j ++ ){
Int y = kernel_point [j];
If (pow (point [x] [0]-point [y] [0], 2) -pow (point [x] [1]-point [y] [1], 2) <pow (Eps, 2 )){
Record. push_back (y );
}
}
Mid. push_back (record );
}
// Merge the vector
For (int I = 0; I <mid. size (); I ++) {// for each row in the mid
// Determine whether the row has been added to a previous row
If (mid [I] [0] =-1 ){
Continue;
}
// If it has not been determined
For (int j = 0; j <mid [I]. size (); j ++) {// judge each of these values
// Determine whether other rows exist for each value
For (int x = I + 1; x <mid. size (); x ++) {// for each subsequent row
If (mid [x] [0] =-1 ){
Continue;
}
For (int y = 0; y <mid [x]. size (); y ++ ){
If (mid [I] [j] = mid [x] [y]) {
// If the same element exists, put it in a vector, add precluster after the loop, and set all element values in the vector to-1.
For (int a = 0; a <mid [x]. size (); a ++ ){
Mid [I]. push_back (mid [x] [a]);
Mid [x] [a] =-1;
}
Break;
}
}
}
}
Cluster. push_back (mid [I]);
}
// Delete duplicate elements in the cluster
For (int I = 0; I <cluster. size (); I ++) {// for each row
For (int j = 0; j <cluster [I]. size (); j ++ ){
For (int n = j + 1; n <cluster [I]. size (); n ++ ){
If (cluster [I] [j] = cluster [I] [n]) {
Cluster [I]. erase (cluster [I]. begin () + n );
N --;
}
}
}
}
// At This point, each cluster is saved, and each cluster has a location corresponding to point [] []
// Assign each boundary point to a cluster associated with the core point
For (int I = 0; I <border_point.size (); I ++) {// for each boundary point
Int x = border_point [I];
For (int j = 0; j <cluster. size (); j ++) {// check each cluster, determine the core node associated with the edge node, and add the edge node to the cluster where the first core node appears.
Int flag = 0; // flag = 0 indicates that no matching item exists. flag = 1 indicates that a matching item exists and the loop is exited.
For (int k = 0; k <cluster [j]. size (); k ++ ){
Int y = cluster [j] [k];
If (pow (point [x] [0]-point [y] [0], 2) + pow (point [x] [1]-point [y] [1], 2) <pow (Eps, 2 )){
Cluster [j]. push_back (x );
Flag = 1;
Break;
}
}
If (flag = 1 ){
Break;
}
}
}
/*************************************** **************************************** ************/
Cout <"All Points:" <endl;
For (int I = 0; I <N; I ++ ){
Cout <"no." <I <"no." <"\ t ";
For (int j = 0; j <2; j ++ ){
Cout <point [I] [j] <"\ t ";
}
Cout <endl;
}
Cout <endl;
Cout <"Kernel Points:" <endl;
For (int I = 0; I <kernel_point.size (); I ++ ){
Cout <kernel_point [I] <"\ t ";
}
Cout <endl;
Cout <"Border Points:" <endl;
For (int I = 0; I <border_point.size (); I ++ ){
Cout <border_point [I] <"\ t ";
}
Cout <endl;
Cout <"Noise Points:" <endl;
For (int I = 0; I <noise_point.size (); I ++ ){
Cout <noise_point [I] <"\ t ";
}
Cout <endl;
Cout <"Cluster:" <endl;
For (int I = 0; I <cluster. size (); I ++ ){
Cout <"no." <I <"no." <"\ t ";
For (int j = 0; j <cluster [I]. size (); j ++ ){
Cout <cluster [I] [j] <"\ t ";
}
Cout <endl;
}
Return 0;
}
V. Running Results
Figure 1: Output of the DBSCAN program
Figure 2: The DBSCAN results plotted as a graph
(Pink points are noise points; the blue and yellow points are the two clusters.)