The k-means algorithm is implemented here in C and can be used to cluster data of any dimension.
While thinking about how to parallelize the k-means algorithm with MPI, I recently spent two days implementing the serial version first.
Clustering means: given a set of elements V, where each element has d observed attributes, use an algorithm to divide V into k subsets such that the differences between elements within the same subset are as small as possible, while the differences between elements in different subsets are as large as possible.
The following steps, taken from a flowchart found via Google, describe the algorithm clearly:
1. Randomly select k data points in the dataset as the initial cluster center:
2. For each data point, find the nearest cluster center and assign the point to that cluster:
3. Use the current cluster to recalculate the center point:
4. Repeat steps 2 and 3 until convergence (the maximum number of iterations is reached, or the cluster centers move only an extremely small distance):
Code:
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Safety cap so that the refinement loop always terminates. */
enum { MAX_ITERATIONS = 1000 };

/* Two successive total distances closer than this count as converged. */
static const float CONVERGENCE_EPS = 1e-4f;

int K, N, D;            /* number of clusters, number of data points, data dimension */
float **data;           /* N x D matrix holding the data points */
int *in_cluster;        /* in_cluster[i] is the 0-based cluster index of data point i */
float **cluster_center; /* K x D matrix holding the center of each cluster */

float **array(int m, int n);
void freearray(float **p);
float **loadData(int *k, int *d, int *n);
float getDistance(const float avector[], const float bvector[], int n);
void cluster(void);
float getDifference(void);
void getCenter(int in_cluster[]);

/*
 * Runs k-means on the data set stored in data.txt and prints the data,
 * the cluster assignment of every iteration, the recomputed centers and
 * the total point-to-center distance after each iteration.
 *
 * Returns EXIT_FAILURE when the data set cannot be loaded, 0 otherwise.
 */
int main(void)
{
    int i, j, count = 0;
    float temp1, temp2;

    data = loadData(&K, &D, &N);
    if (data == NULL)
        return EXIT_FAILURE;

    printf("Data sets:\n");
    for (i = 0; i < N; i++) {
        for (j = 0; j < D; j++) {
            printf("%-8.2f", data[i][j]);
            if ((j + 1) % D == 0)
                putchar('\n');
        }
    }
    printf("-------------------------------\n");

    /*
     * Randomly pick k data points as the initial centers. The index is
     * drawn once per center so that every coordinate of a center comes
     * from the same data point; scaling rand() in floating point avoids
     * the int overflow of N * rand().
     */
    srand((unsigned int)time(NULL));
    for (i = 0; i < K; i++) {
        int pick = (int)(rand() / (RAND_MAX + 1.0) * N);
        for (j = 0; j < D; j++)
            cluster_center[i][j] = data[pick][j];
    }

    cluster(); /* first assignment, using the k random centers */
    temp1 = getDifference();
    count++;
    printf("The difference between data and center is: %.2f\n", temp1);

    getCenter(in_cluster);
    cluster(); /* second assignment, using the recomputed centers */
    temp2 = getDifference();
    count++;
    printf("The difference between data and center is: %.2f\n", temp2);

    /* Keep refining until the total distance stops changing. */
    while (fabsf(temp2 - temp1) > CONVERGENCE_EPS && count < MAX_ITERATIONS) {
        temp1 = temp2;
        getCenter(in_cluster);
        cluster();
        temp2 = getDifference();
        count++;
        printf("The %dth difference between data and center is: %.2f\n", count, temp2);
    }

    printf("The total number of iterations is: %d\n", count);

    freearray(data);
    freearray(cluster_center);
    free(in_cluster);
    data = NULL;
    cluster_center = NULL;
    in_cluster = NULL;
    return 0;
}

/*
 * Dynamically creates an m x n matrix of float, zero-initialized.
 *
 * The matrix is one row-pointer table plus one contiguous payload, so it
 * must be released with freearray(), not with a plain free().
 *
 * Returns NULL when m or n is not positive, when m * n overflows, or when
 * memory is exhausted.
 */
float **array(int m, int n)
{
    float **rows;

    if (m <= 0 || n <= 0)
        return NULL;
    if ((size_t)n > SIZE_MAX / (size_t)m)
        return NULL;

    rows = malloc((size_t)m * sizeof *rows);
    if (rows == NULL)
        return NULL;

    rows[0] = calloc((size_t)m * (size_t)n, sizeof **rows);
    if (rows[0] == NULL) {
        free(rows);
        return NULL;
    }

    for (int i = 1; i < m; i++)
        rows[i] = rows[i - 1] + n;
    return rows;
}

/* Releases a matrix created by array(). Passing NULL is a no-op. */
void freearray(float **p)
{
    if (p == NULL)
        return;
    free(p[0]); /* contiguous payload */
    free(p);    /* row-pointer table */
}

/*
 * Loads the data set from data.txt.
 *
 * The first line must have the form "K=<clusters>,D=<dimension>,N=<points>"
 * (whitespace around the tokens is accepted); it is followed by N * D
 * whitespace-separated floating-point values.
 *
 * On success the header values are stored through k, d and n, the globals
 * cluster_center (K x D) and in_cluster (N entries) are allocated, and the
 * N x D data matrix is returned; the caller releases it with freearray().
 * On any failure a message is written to stderr, everything allocated here
 * is released, cluster_center and in_cluster are reset to NULL, and NULL
 * is returned.
 */
float **loadData(int *k, int *d, int *n)
{
    float **arraydata = NULL;
    FILE *fp = fopen("data.txt", "r");

    if (fp == NULL) {
        fprintf(stderr, "cannot open data.txt!\n");
        return NULL;
    }

    /* A space in a scanf format matches any amount of whitespace, including none. */
    if (fscanf(fp, " K = %d , D = %d , N = %d", k, d, n) != 3
        || *k <= 0 || *d <= 0 || *n <= 0) {
        fprintf(stderr, "load error!\n");
        goto fail;
    }

    arraydata = array(*n, *d);                            /* the data points */
    cluster_center = array(*k, *d);                       /* the cluster centers */
    in_cluster = malloc((size_t)*n * sizeof *in_cluster); /* the assignment flags */
    if (arraydata == NULL || cluster_center == NULL || in_cluster == NULL) {
        fprintf(stderr, "load error!\n");
        goto fail;
    }

    for (int i = 0; i < *n; i++) {
        for (int j = 0; j < *d; j++) {
            if (fscanf(fp, "%f", &arraydata[i][j]) != 1) {
                fprintf(stderr, "load error!\n");
                goto fail;
            }
        }
    }

    fclose(fp);
    return arraydata;

fail:
    fclose(fp);
    freearray(arraydata);
    freearray(cluster_center);
    free(in_cluster);
    cluster_center = NULL;
    in_cluster = NULL;
    return NULL;
}

/*
 * Returns the Euclidean distance between two n-dimensional vectors.
 * The squares are accumulated in double to limit rounding error.
 */
float getDistance(const float avector[], const float bvector[], int n)
{
    double sum = 0.0;

    for (int i = 0; i < n; i++) {
        double diff = (double)avector[i] - (double)bvector[i];
        sum += diff * diff;
    }
    return (float)sqrt(sum);
}

/*
 * Assigns each of the N data points to its nearest cluster center,
 * recording the result in in_cluster and printing it (1-based).
 */
void cluster(void)
{
    for (int i = 0; i < N; ++i) {
        float min = FLT_MAX;

        in_cluster[i] = 0; /* defined even if no distance compares below min */
        for (int j = 0; j < K; ++j) {
            float dist = getDistance(data[i], cluster_center[j], D);
            if (dist < min) {
                min = dist;
                in_cluster[i] = j;
            }
        }
        printf("data[%d] in cluster-%d\n", i, in_cluster[i] + 1);
    }
    printf("---------------------------\n");
}

/*
 * Returns the sum, over all data points, of the distance between the
 * point and the center of the cluster it is currently assigned to.
 */
float getDifference(void)
{
    double sum = 0.0;

    for (int j = 0; j < N; ++j)
        sum += getDistance(data[j], cluster_center[in_cluster[j]], D);
    return (float)sum;
}

/*
 * Recomputes every cluster center as the mean of the data points that
 * in_cluster assigns to it, then prints the new centers.
 *
 * A cluster with no assigned points keeps its previous center instead of
 * being divided by zero. Terminates the program if the scratch buffers
 * cannot be allocated.
 */
void getCenter(int in_cluster[])
{
    float **sum = array(K, D); /* per-cluster coordinate sums, zero-initialized */
    int *members = calloc((size_t)K, sizeof *members);
    int i, j, q;

    if (sum == NULL || members == NULL) {
        fprintf(stderr, "out of memory\n");
        freearray(sum);
        free(members);
        exit(EXIT_FAILURE);
    }

    for (j = 0; j < N; j++) {
        int c = in_cluster[j];
        for (q = 0; q < D; q++)
            sum[c][q] += data[j][q];
        members[c]++;
    }

    for (i = 0; i < K; i++) {
        if (members[i] == 0)
            continue; /* empty cluster: leave its center where it was */
        for (q = 0; q < D; q++)
            cluster_center[i][q] = sum[i][q] / members[i];
    }

    printf("The new center of cluster is:\n");
    for (i = 0; i < K; i++) {
        for (q = 0; q < D; q++) {
            printf("%-8.2f", cluster_center[i][q]);
            if ((q + 1) % D == 0)
                putchar('\n');
        }
    }

    freearray(sum);
    free(members);
}
This program supports datasets of different dimensions. For example, the dataset data.txt is as follows:
K=3,D=3,N=15
-25 22.2 35.34
31.2 -14.4 23
32.02 -23 24.44
-25.35 36.3 -33.34
-20.2 27.333 -28.22
-15.66 17.33 -23.33
26.3 -31.34 16.3
-22.544 16.2 -32.22
12.2 -15.22 22.11
-41.241 25.232 -35.338
-22.22 45.22 23.55
-34.22 50.14 30.98
15.23 -30.11 20.987
-32.5 15.3 -25.22
-38.97 20.11 33.22