// map: body of the MinHash map step. For one input item it computes one minimum
// hash value per hash function over the item's non-zero features, then emits the
// item once per hash function under a cluster key built from keyGroups
// consecutive min-hash values.
// NOTE(review): the enclosing method header is outside this excerpt; features,
// item, context and the fields used below are assumed to be declared there.
Vector featureVector = features.get();
if (featureVector.size() < minVectorSize) {
  // Vector is smaller than the configured minimum: emit nothing for this item.
  return;
}

// Initialize the MinHash values to highest
for (int i = 0; i < numHashFunctions; i++) {
  minHashValues[i] = Integer.MAX_VALUE;
}

for (int i = 0; i < numHashFunctions; i++) {
  for (Vector.Element ele : featureVector.nonZeroes()) {
    // Hash either the element's value (truncated to int) or its index,
    // depending on the hashValue flag.
    int value = hashValue ? (int) ele.get() : ele.index();
    // Split the 32-bit value into four bytes, most significant byte first.
    bytesToHash[0] = (byte) (value >> 24);
    bytesToHash[1] = (byte) (value >> 16);
    bytesToHash[2] = (byte) (value >> 8);
    bytesToHash[3] = (byte) value;
    int hashIndex = hashFunction[i].hash(bytesToHash);
    // If our new hash value is less than the old one, replace the old one
    if (minHashValues[i] > hashIndex) {
      minHashValues[i] = hashIndex;
    }
  }
}

// Output the cluster information
for (int i = 0; i < numHashFunctions; i++) {
  StringBuilder clusterIdBuilder = new StringBuilder();
  for (int j = 0; j < keyGroups; j++) {
    // Wrap around so every starting position i yields keyGroups values.
    clusterIdBuilder.append(minHashValues[(i + j) % numHashFunctions]).append('-');
  }
  // Remove the last dash.
  // NOTE(review): with keyGroups == 0 the builder is empty and this call throws;
  // presumably keyGroups >= 1 is validated elsewhere — confirm.
  clusterIdBuilder.deleteCharAt(clusterIdBuilder.length() - 1);

  cluster.set(clusterIdBuilder.toString());

  if (debugOutput) {
    // Debug mode emits the feature vector itself instead of the item.
    vector.set(featureVector);
    context.write(cluster, vector);
  } else {
    context.write(cluster, item);
  }
}
protected void reduce (Text cluster, iterable<writable> points, context context)
throws IOException, interruptedexception {
collection<writable> pointlist = Lists.newarraylist ();
for (writable point:points) {
if (debugoutput) {
Vector pointvector = ((vectorwritable) point). Get (). Clone ();
writable writablepointvector = new vectorwritable (pointvector);
Pointlist.add (Writablepointvector);
} else {
writable pointtext = new Text (point.tostring ());
Pointlist.add (Pointtext);
}
if (Pointlist.size () >= minclustersize) {
context.getcounter (clusters.accepted). Increment (1);
for (writable point:pointlist) {
context.write (cluster, point);
}
} else {
context.getcounter ( clusters.discarded). Increment (1);
}
}
See more highlights of this column: http://www.bianceng.cn/Programming/sjjg/