package com.common;

import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
 * Character-level cosine similarity for Chinese text.
 *
 * <p>Each document is reduced to a frequency vector over the Chinese characters it contains
 * (characters in the range U+4E00..U+9FA5 that can also be encoded in GB2312). The similarity is
 * the cosine of the angle between the two vectors, rounded to two decimal places.
 */
public class Cosinesimilaralgorithm {

    /** GB2312 charset, resolved once; {@code null} when the running JVM does not provide it. */
    private static final Charset GB2312 = lookupGb2312();

    /**
     * Computes the cosine similarity of the Chinese characters in two documents.
     *
     * @param doc1 first document; must be non-null and contain at least one non-whitespace char
     * @param doc2 second document; same constraints as {@code doc1}
     * @return the similarity rounded half-up to two decimals, or {@code 0.0} when either document
     *     contains no countable Chinese character
     * @throws NullPointerException if either document is {@code null} or blank (the exception
     *     type is kept for compatibility with existing callers)
     */
    public static double Getsimilarity(String doc1, String doc2) {
        if (doc1 == null || doc1.trim().length() == 0
                || doc2 == null || doc2.trim().length() == 0) {
            throw new NullPointerException("The Document is null or has not cahrs!!");
        }

        // GB2312 position -> {occurrences in doc1, occurrences in doc2}
        Map<Integer, int[]> frequencies = new HashMap<Integer, int[]>();
        countHanzi(doc1, 0, frequencies);
        countHanzi(doc2, 1, frequencies);

        double dotProduct = 0;
        double sqDoc1 = 0;
        double sqDoc2 = 0;
        for (int[] counts : frequencies.values()) {
            // Widen before multiplying: int * int overflows once a character occurs more than
            // 46340 times, which used to flip the sign of the result.
            double inDoc1 = counts[0];
            double inDoc2 = counts[1];
            dotProduct += inDoc1 * inDoc2;
            sqDoc1 += inDoc1 * inDoc1;
            sqDoc2 += inDoc2 * inDoc2;
        }

        // A zero norm means that document had no countable character; the quotient would be
        // 0/0 (NaN), which is reported as "no similarity".
        if (sqDoc1 == 0 || sqDoc2 == 0) {
            return 0.0;
        }

        double cosine = dotProduct / Math.sqrt(sqDoc1 * sqDoc2);
        // valueOf rounds the shortest decimal representation of the double rather than its
        // exact binary expansion.
        return BigDecimal.valueOf(cosine).setScale(2, RoundingMode.HALF_UP).doubleValue();
    }

    /**
     * Tells whether a character lies in the Chinese character range checked by this class.
     *
     * @param ch character to test
     * @return {@code true} if {@code ch} is between U+4E00 and U+9FA5 inclusive
     */
    public static boolean Ishanzi(char ch) {
        return ch >= 0x4e00 && ch <= 0x9fa5;
    }

    /**
     * Returns the position of a character within the GB2312 two-byte code table.
     *
     * <p>The position is {@code (b0 - 0xA1) * 94 + (b1 - 0xA1)}, where {@code b0} and {@code b1}
     * are the two bytes of the GB2312 encoding: codes start at 0xA1 and each row holds 94 cells.
     *
     * @param ch character to look up
     * @return the position, or {@code -1} if {@code ch} does not encode to exactly two bytes in
     *     GB2312 (for example ASCII or an unmappable character) or GB2312 is unavailable
     */
    public static short getgb2312id(char ch) {
        if (GB2312 == null) {
            return -1;
        }
        byte[] buffer = Character.toString(ch).getBytes(GB2312);
        if (buffer.length != 2) {
            // Not a two-byte GB2312 character; unmappable input is encoded as a single '?'.
            return -1;
        }
        int b0 = (buffer[0] & 0x0FF) - 161; // codes start at 0xA1 = 161
        int b1 = (buffer[1] & 0x0FF) - 161;
        return (short) (b0 * 94 + b1);
    }

    /**
     * Small demo that prints a few similarities.
     *
     * <p>NOTE(review): none of the sample strings below contains a Chinese character, so every
     * line prints 0.0; they look like translated placeholders for Chinese samples.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str1 = "Rrr11ttrrr";
        String str2 = "Titl 456 e4";
        String str3 = "The killer is not too cold";
        String str4 = "killer cold";
        String str5 = "This killer is not";

        System.out.println(Getsimilarity(str1, str2));
        System.out.println(Getsimilarity(str1, str3));
        System.out.println(Getsimilarity(str1, str4));
        System.out.println(Getsimilarity(str1, str5));
    }

    /** Adds the count of every GB2312-encodable Chinese character of doc to the given slot. */
    private static void countHanzi(String doc, int slot, Map<Integer, int[]> frequencies) {
        for (int i = 0; i < doc.length(); i++) {
            char ch = doc.charAt(i);
            if (!Ishanzi(ch)) {
                continue;
            }
            int charIndex = getgb2312id(ch);
            if (charIndex == -1) {
                continue;
            }
            int[] counts = frequencies.get(charIndex);
            if (counts == null) {
                counts = new int[2];
                frequencies.put(charIndex, counts);
            }
            counts[slot]++;
        }
    }

    /** Resolves GB2312 once, reporting a single diagnostic line if the JVM lacks it. */
    private static Charset lookupGb2312() {
        try {
            return Charset.forName("GB2312");
        } catch (IllegalArgumentException e) {
            // UnsupportedCharsetException is a subclass of IllegalArgumentException.
            System.err.println("GB2312 charset is not available; no character can be indexed: " + e);
            return null;
        }
    }
}
// Comparison of similarity of Chinese characters