(Final exams are coming up, so this write-up is rough; I ask for the reader's understanding.)
I. Concept
K-means is a prototype-based, partitional clustering technique. It attempts to find a user-specified number (K) of clusters, each represented by its centroid. The K-means algorithm takes K as input and divides the N data objects into K clusters such that objects in the same cluster have high similarity, while objects in different clusters have low similarity. Cluster similarity is measured against a "central object" (center of gravity) for each cluster, which is obtained as the mean value of the objects in that cluster.
II. Pseudocode
1 Select K points as the initial centroids.
2 Repeat
3     Assign each point to its nearest centroid, forming K clusters.
4     Recalculate the centroid of each cluster.
5 Until the centroids no longer change.
III. Important Data Structures
1. Defining the number of clusters, points, and dimensions
#define K 3   // K is the number of clusters
#define N 20  // N is the number of points
#define D 2   // D is the number of dimensions
2. Array declarations
Double Point[n][d]; n D-Dimensional points
Double Barycenter_initial[k][d]; K D-dimensional initial centroid position
Double Barycenter_before[k][d]; Record the position of the centroid before each transformation
Double Barycenter_finished[k][d]; Finally get the centroid position
Double O_distance[k]; Record the Euclidean distance of a point for each centroid
int belongwhichbc[n]; Record which cluster each point belongs to
Double Mid[d]; Record intermediate values
3. Randomly generating the data points
Initialize data points (coordinate values are between 0-100)
void Coordinatedistribution (int n, int d) {
Srand ((unsigned) time (NULL)); Guaranteed randomness
for (int i=0; i<n; i++) {
for (int j=0; j<d; J + +) {
POINT[I][J] = rand ()% 101;
}
}
}
IV. Source Code
// C++ implementation of the K-means algorithm
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <fstream>
using namespace std;

#define K 3   // K is the number of clusters
#define N 20  // N is the number of points
#define D 2   // D is the number of dimensions

double point[N][D];                // N D-dimensional points
double barycenter_initial[K][D];   // K D-dimensional initial centroid positions
double barycenter_before[K][D];    // centroid positions before each update
double barycenter_finished[K][D];  // resulting centroid positions
double o_distance[K];              // distance from the current point to each centroid
int belongwhichbc[N];              // which cluster each point belongs to
double mid[D];                     // intermediate values (per-dimension coordinate sums)
Initialize data points (coordinate values are between 0-100)
void Coordinatedistribution (int n, int d) {
Srand ((unsigned) time (NULL));//Guaranteed randomness
for (int i=0; i<n; i++) {
for (int j=0; j<d; J + +) {
POINT[I][J] = rand ()% 101;
}
}
}
Initialize centroid (coordinate values are between 0-100)
void Initbarycenter (int k, int d) {
for (int i=0; i<k; i++) {
for (int j=0; j<d; J + +) {
BARYCENTER_INITIAL[I][J] = rand ()% 101;
}
}
}
int main (int argc, char** argv) {
Randomly assign d-dimensional coordinates to n points
int n = n, d = D;
Coordinatedistribution (n, D);
First output the values of K, N, D
cout<< "Cluster K =" <<K<<endl<< "number of points N =" <<N<<endl<< "Dimension D =" <<d<<endl& lt;<endl;
Output n Coordinate points
cout<< "system generates n points as follows:" <<endl;
for (int i=0; i<n; i++) {
cout<< "First" <<i+1<< "<<" "T";
for (int j=0; j<d; J + +) {
cout<<point[i][j]<< "\ t";
}
cout<<endl;
}
cout<<endl;
Select K Initial centroid
int k = k;
Initbarycenter (k, D);
Initial centroid generated by the output system
cout<< "System-generated K initial centroid is as follows:" <<endl;
for (int i=0; i<k; i++) {
cout<< "First" <<i+1<< "<<" "T";
for (int j=0; j<d; J + +) {
cout<<barycenter_initial[i][j]<< "\ t";
}
cout<<endl;
}
cout<<endl;
The position of the "first transform before particle" is initialized to the position of initial.
Initialize the position of the "resulting particle" to (-1,-1) so that it is not the same position as before the first transformation
for (int i=0; i<k; i++) {
for (int j=0; j<d; J + +) {
BARYCENTER_BEFORE[I][J] = Barycenter_initial[i][j];
BARYCENTER_FINISHED[I][J] =-1;
}
}
int times = 0;//defines the loop to proceed to the first few
Cyclic calculation
while (true) {
for (int i=0; i<n; i++) {//For each point
for (int j=0; j<k; J + +) {//for K-clusters, Euclidean distance for each cluster
Double sum = 0;
for (int x=0; x<d; x + +) {
sum = sum + POW (point[i][x]-barycenter_before[j][x], 2);
}
O_DISTANCE[J] = sqrt (sum);//Because Sum and sqrt (sum) are positive correlations, to compare the size of the sqrt (sum), simply compare the size of sum
O_DISTANCE[J] = sum;
}
int x = 0, temp = x;//temp contains: The smallest number of Euclidean distances for a point
while (x<k) {
if (O_distance[x] < o_distance[temp]) {
temp = x;
x + +;
}
else {
x + +;
}
}
Belongwhichbc[i] = temp;
}
for (int j=0; j<k; J + +) {
Place all elements in a[] 0
for (int i=0; i<d; i++) {
Mid[i] = 0;
}
int number = 0;//calculates how many points a cluster has
for (int i=0; i<n; i++) {
if (belongwhichbc[i] = = j) {//number of clusters in a point match
number++;
for (int y=0; y<d; y++) {
Mid[y] = Mid[y] + point[i][y];
}
}
}
for (int y=0; y<d; y++) {
Barycenter_finished[j][y] = Mid[y]/number;
}
}
Flag=0, indicating that the Barycenter_before is exactly the same as the elements inside the barycenter_finished, exiting the loop
Flag=1, indicating that the elements in the two are not exactly the same, and still need to loop
int flag = 0;
for (int i=0; i<k; i++) {
for (int j=0; j<d; J + +) {
if (Barycenter_before[i][j]-barycenter_finished[i][j] <= 0.0001) {
Flag = 0;
Continue
}
else {
flag = 1;
Break
}
}
if (flag = = 0) {
Continue
}
else {
Break
}
}
if (flag = = 0) {
times++;
After the cout<< "<<times<<" wheel cycle, the resulting k centroid is as follows: "<<endl;
for (int m=0; m<k; m++) {
cout<< "First" <<m+1<< "<<" "T";
for (int n=0; n<d; n++) {
cout<<barycenter_finished[m][n]<< "\ t";
}
cout<<endl;
}
Break
}
else {
times++;
After the cout<< "<<times<<" wheel cycle, the resulting k centroid is as follows: "<<endl;
for (int m=0; m<k; m++) {
cout<< "First" <<m+1<< "<<" "T";
for (int n=0; n<d; n++) {
cout<<barycenter_finished[m][n]<< "\ t";
}
cout<<endl;
}
To continue the loop, you should use the elements in the barycenter_finished as the elements in the Barycenter_before in the next loop
for (int i=0; i<k; i++) {
for (int j=0; j<d; J + +) {
BARYCENTER_BEFORE[I][J] = Barycenter_finished[i][j];
}
}
Continue
}
}
cout<<endl;
Output final centroid position
cout<< "by K-means algorithm, the centroid of each cluster is as follows:" <<endl;
for (int i=0; i<k; i++) {
cout<< "First" <<i+1<< "<<" "T";
for (int j=0; j<d; J + +) {
cout<<barycenter_finished[i][j]<< "\ t";
}
cout<<endl;
cout<< "The cluster contains the points:" <<endl;
for (int j=0; j<n; J + +) {
if (belongwhichbc[j] = = i) {
cout<<j+1<< "\ t";
}
}
cout<<endl;
}
return 0;
}
V. Results
Note: K = 3, N = 20, D = 2.
Figure 1: K-means algorithm run result (1)
Figure 2: K-means algorithm run result (2)
Figure 3: The K-means algorithm's results displayed as a plot
Data Mining Algorithms: A C++ Implementation of the K-means Algorithm