As the saying goes, "birds of a feather flock together": clustering is the process of dividing a given collection of documents into clusters of similar items.
Designing a clustering process involves three things:
(1) A clustering algorithm (K-means, Fuzzy k-means, canopy, etc.)
(2) A notion of similarity and dissimilarity (a distance measure)
A. Euclidean distance
B. Squared Euclidean distance
C. Manhattan Distance
D. Cosine distance measure
E. Tanimoto distance measure
F. Weighted distance measure (TF-IDF: term frequency-inverse document frequency)
(3) A stopping condition
A Java implementation of the K-means clustering algorithm based on the Euclidean distance measure is as follows:
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.Cluster;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
public class Simplekmeansclustering {public static final double[][] points = {1, 1}, {2, 1}, {1, 2},
{2, 2}, {3, 3}, {8, 8}, {9, 8}, {8, 9}, {9, 9}};
public static void Writepointstofile (list<vector> points, String FileName, FileSystem FS, Configuration conf) throws I
oexception {Path PATH = new Path (fileName);
Sequencefile.writer Writer = new Sequencefile.writer (FS, conf, path, Longwritable.class, Vectorwritable.class);
Long recnum = 0;
Vectorwritable VEC = new vectorwritable ();
for (Vector point:points) {vec.set (point);
Writer.append (New Longwritable (recnum++), VEC);
} writer.close (); public static list<vector> getpoints (double[][] raw) {list<vector> points = new Arraylist<vecto
R> ();
for (int i = 0; i < raw.length i++) {double[] fr = Raw[i];
Vector VEC = new Randomaccesssparsevector (fr.length);
Vec.assign (FR);
Points.Add (VEC);
return points; public static void Main (String arGs[]) throws Exception {int k = 2;
list<vector> vectors = getpoints (points);
File TestData = new file ("TestData");
if (!testdata.exists ()) {Testdata.mkdir ();
} testData = new File ("testdata/points");
if (!testdata.exists ()) {Testdata.mkdir ();
} Configuration conf = new Configuration ();
FileSystem fs = Filesystem.get (conf);
Writepointstofile (vectors, "testdata/points/file1", FS, conf);
Path PATH = new Path ("testdata/clusters/part-00000");
Sequencefile.writer Writer = new Sequencefile.writer (FS, conf, path, Text.class, Cluster.class);
for (int i = 0; i < K; i++) {Vector VEC = Vectors.get (i);
Cluster Cluster = new Cluster (VEC, I, New Euclideandistancemeasure ());
Writer.append (New Text (Cluster.getidentifier ()), cluster);
} writer.close (); Kmeansdriver.run (conf, New Path ("Testdata/points"), New Path ("Testdata/clusters"), New Path ("OUTput "), New Euclideandistancemeasure (), 0.001, true, false);
Sequencefile.reader Reader = new Sequencefile.reader (FS, New Path ("output/" + cluster.clustered_points_dir
+ "/part-m-00000"), conf);
Intwritable key = new intwritable ();
Weightedvectorwritable value = new weightedvectorwritable ();
while (Reader.next (key, value)) {System.out.println (value.tostring () + ' belongs to cluster ')
+ key.tostring ());
} reader.close ();
}
}