.NET text similarity algorithms: the cosine theorem and SimHash — analysis and application

Source: Internet
Author: User

Cosine similarity

// Cosine similarity
//
// Principle: first segment both texts into words and list all distinct words.
// Then count the frequency of each word and turn each text into a term-frequency
// vector; measuring the similarity of the two texts then reduces to measuring
// the similarity of the two vectors.
//
// Example:
//   Text 1: I / love / Beijing / Tiananmen       -> frequency vector [1, 1, 1, 1]
//   Text 2: we / all love / Beijing / Tiananmen  -> frequency vector [1, 0, 1, 2]
//
// Each vector can be seen as a line segment in space starting at the origin
// ([0, 0, ...]) and pointing in some direction. The two segments form an angle:
// 0 degrees means the directions coincide, 90 degrees means they are completely
// different, and 180 degrees means they are opposite. The smaller the angle,
// the more similar the two texts are.
//
// C# core algorithm (reconstructed from the garbled listing; depends on a
// project-local `Tokeniser` class that provides `Partition(string)` and
// `ArrayListToArray(ArrayList)`):

/// <summary>
/// Computes TF-IDF term weights for a set of documents and exposes
/// document-to-document cosine similarity over those weight vectors.
/// </summary>
public class TFIDFMeasure
{
    private string[] _docs;            // the raw input documents
    private string[][] _ngramDoc;      // per-document token arrays (reserved, see GeneratNgramText)
    private int _numDocs = 0;
    private int _numTerms = 0;
    private ArrayList _terms;          // distinct terms across all documents
    private int[][] _termFreq;         // _termFreq[term][doc] = raw frequency
    private float[][] _termWeight;     // _termWeight[term][doc] = TF * IDF
    private int[] _maxTermFreq;        // per-document maximum term frequency (TF normalizer)
    private int[] _docFreq;            // number of documents containing each term

    /// <summary>Static vector helpers used for the cosine-similarity computation.</summary>
    public class TermVector
    {
        /// <summary>
        /// Cosine of the angle between two equal-length vectors:
        /// dot(v1, v2) / (|v1| * |v2|). Returns 0 when either vector is zero.
        /// </summary>
        public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new Exception("DIFFER LENGTH");

            float denom = (VectorLength(vector1) * VectorLength(vector2));
            if (denom == 0F)
                return 0F;     // avoid division by zero for all-zero vectors
            else
                return (InnerProduct(vector1, vector2) / denom);
        }

        /// <summary>Dot product of two equal-length vectors.</summary>
        public static float InnerProduct(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new Exception("DIFFER LENGTH ARE NOT ALLOWED");

            float result = 0F;
            for (int i = 0; i < vector1.Length; i++)
                result += vector1[i] * vector2[i];

            return result;
        }

        /// <summary>Euclidean (L2) norm of a vector.</summary>
        public static float VectorLength(float[] vector)
        {
            float sum = 0.0F;
            for (int i = 0; i < vector.Length; i++)
                sum = sum + (vector[i] * vector[i]);

            return (float)Math.Sqrt(sum);
        }
    }

    // term -> index into _terms / _termFreq / _termWeight
    private IDictionary _wordsIndex = new Hashtable();

    /// <summary>Builds the full TF-IDF weight matrix for <paramref name="documents"/>.</summary>
    public TFIDFMeasure(string[] documents)
    {
        _docs = documents;
        _numDocs = documents.Length;
        MyInit();
    }

    // Intentionally empty in the original article (n-gram generation stub).
    private void GeneratNgramText()
    {
    }

    /// <summary>Collects the distinct terms occurring in any document, in first-seen order.</summary>
    private ArrayList GenerateTerms(string[] docs)
    {
        ArrayList uniques = new ArrayList();
        _ngramDoc = new string[_numDocs][];
        for (int i = 0; i < docs.Length; i++)
        {
            Tokeniser tokenizer = new Tokeniser();
            string[] words = tokenizer.Partition(docs[i]);

            for (int j = 0; j < words.Length; j++)
                if (!uniques.Contains(words[j]))
                    uniques.Add(words[j]);
        }
        return uniques;
    }

    /// <summary>Stores <paramref name="newValue"/> under <paramref name="key"/>, returning the previous value (or null).</summary>
    private static object AddElement(IDictionary collection, object key, object newValue)
    {
        object element = collection[key];
        collection[key] = newValue;
        return element;
    }

    /// <summary>Index of a term in the vocabulary, or -1 when unknown.</summary>
    private int GetTermIndex(string term)
    {
        object index = _wordsIndex[term];
        if (index == null) return -1;
        return (int)index;
    }

    /// <summary>Allocates the frequency/weight matrices and fills them.</summary>
    private void MyInit()
    {
        _terms = GenerateTerms(_docs);
        _numTerms = _terms.Count;

        _maxTermFreq = new int[_numDocs];
        _docFreq = new int[_numTerms];
        _termFreq = new int[_numTerms][];
        _termWeight = new float[_numTerms][];

        for (int i = 0; i < _terms.Count; i++)
        {
            _termWeight[i] = new float[_numDocs];
            _termFreq[i] = new int[_numDocs];

            AddElement(_wordsIndex, _terms[i], i);
        }

        GenerateTermFrequency();
        GenerateTermWeight();
    }

    private float Log(float num)
    {
        return (float)Math.Log(num); // natural log (the original comment said "log2" — the base only rescales weights uniformly)
    }

    /// <summary>Fills _termFreq, _docFreq and _maxTermFreq from the raw documents.</summary>
    private void GenerateTermFrequency()
    {
        for (int i = 0; i < _numDocs; i++)
        {
            string curDoc = _docs[i];
            IDictionary freq = GetWordFrequency(curDoc);
            IDictionaryEnumerator enums = freq.GetEnumerator();
            _maxTermFreq[i] = int.MinValue;
            while (enums.MoveNext())
            {
                string word = (string)enums.Key;
                int wordFreq = (int)enums.Value;

                int termIndex = GetTermIndex(word);

                _termFreq[termIndex][i] = wordFreq;
                _docFreq[termIndex]++;   // each word appears once per document in the freq map

                if (wordFreq > _maxTermFreq[i]) _maxTermFreq[i] = wordFreq;
            }
        }
    }

    /// <summary>Computes the TF-IDF weight for every (term, document) pair.</summary>
    private void GenerateTermWeight()
    {
        for (int i = 0; i < _numTerms; i++)
        {
            for (int j = 0; j < _numDocs; j++)
                _termWeight[i][j] = ComputeTermWeight(i, j);
        }
    }

    /// <summary>Term frequency normalized by the document's maximum term frequency.</summary>
    private float GetTermFrequency(int term, int doc)
    {
        int freq = _termFreq[term][doc];
        int maxFreq = _maxTermFreq[doc];

        return ((float)freq / (float)maxFreq);
    }

    /// <summary>IDF = log(N / df) for the given term.</summary>
    private float GetInverseDocumentFrequency(int term)
    {
        int df = _docFreq[term];
        return Log((float)(_numDocs) / (float)df);
    }

    private float ComputeTermWeight(int term, int doc)
    {
        float tf = GetTermFrequency(term, doc);
        float idf = GetInverseDocumentFrequency(term);
        return tf * idf;
    }

    /// <summary>The TF-IDF weight vector (over all terms) for one document.</summary>
    private float[] GetTermVector(int doc)
    {
        float[] w = new float[_numTerms];
        for (int i = 0; i < _numTerms; i++)
            w[i] = _termWeight[i][doc];

        return w;
    }

    /// <summary>Cosine similarity between the TF-IDF vectors of two documents.</summary>
    public float GetSimilarity(int doc_i, int doc_j)
    {
        float[] vector1 = GetTermVector(doc_i);
        float[] vector2 = GetTermVector(doc_j);

        return TermVector.ComputeCosineSimilarity(vector1, vector2);
    }

    /// <summary>Case-insensitive word -> count map for one document.</summary>
    private IDictionary GetWordFrequency(string input)
    {
        string convertedInput = input.ToLower();

        Tokeniser tokenizer = new Tokeniser();
        string[] words = tokenizer.Partition(convertedInput);
        Array.Sort(words);   // sorted so CountWords can binary-search

        string[] distinctWords = GetDistinctWords(words);

        IDictionary result = new Hashtable();
        for (int i = 0; i < distinctWords.Length; i++)
        {
            object tmp;
            tmp = CountWords(distinctWords[i], words);
            result[distinctWords[i]] = tmp;
        }

        return result;
    }

    /// <summary>Distinct words of <paramref name="input"/>, preserving first-seen order.</summary>
    private string[] GetDistinctWords(string[] input)
    {
        if (input == null)
            return new string[0];
        else
        {
            ArrayList list = new ArrayList();

            for (int i = 0; i < input.Length; i++)
                if (!list.Contains(input[i]))   // N-gram similarity?
                    list.Add(input[i]);

            return Tokeniser.ArrayListToArray(list);
        }
    }

    /// <summary>
    /// Counts occurrences of <paramref name="word"/> in the SORTED array
    /// <paramref name="words"/>: binary-search one hit, rewind to the first
    /// occurrence, then scan forward over the equal run.
    /// </summary>
    private int CountWords(string word, string[] words)
    {
        int itemIdx = Array.BinarySearch(words, word);

        if (itemIdx > 0)
            while (itemIdx > 0 && words[itemIdx].Equals(word))
                itemIdx--;   // rewind to just before the first occurrence

        int count = 0;
        // A negative itemIdx (word not found) skips the loop and returns 0.
        while (itemIdx < words.Length && itemIdx >= 0)
        {
            if (words[itemIdx].Equals(word)) count++;

            itemIdx++;
            if (itemIdx < words.Length)
                if (!words[itemIdx].Equals(word)) break;
        }

        return count;
    }
}

// Disadvantage: a single feature may give an article a very high weight in one
// dimension, and the full vector dimensionality is very large, so this approach
// is not suitable for computations over large data volumes.
//
// SimHash principle: the main idea of the algorithm is dimensionality
// reduction — map a high-dimensional feature vector to an f-bit fingerprint,
// then decide whether two articles are duplicates or highly similar by the
// Hamming distance between their f-bit fingerprints. Because each article's
// fingerprint can be computed and stored in advance, comparison reduces to a
// Hamming-distance computation, which is very fast for big-data workloads.
// Google uses this algorithm for web-page deduplication.
//
// Assume the following three texts:
//   1. the cat sat on the mat
//   2. the cat sat on a mat
//   3. we all scream for ice cream
//
// How is this hash implemented? Taking the texts above as an example, the whole
// process can be divided into six steps:
//   1. Choose the number of SimHash bits (e.g. 32), weighing storage cost
//      against the size of the data set.
//   2. Initialize the SimHash (an array of per-bit counters) to 0.
//   3. Extract features from the original text, generally with some word
//      segmentation; e.g. "the cat sat on the mat" with 2-gram segmentation
//      yields: {"th", "he", "e ", " c", "ca", "at", "t ", " s", "sa", " o",
//      "on", "n ", " t", " m", "ma"}.
//   4. Compute a classic 32-bit hash code for each feature, e.g.
//      "th".Hash = -502157718, "he".Hash = -369049682, ...
//   5. For every bit of each feature's hash code: if the bit is 1, add 1 to the
//      corresponding SimHash counter, otherwise subtract 1.
//   6. Finally, for each of the 32 counters: if it is greater than 0, set the
//      corresponding SimHash bit to 1; otherwise set it to 0.

.NET text similarity algorithms: the cosine theorem and SimHash — analysis and application

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If the content of this page seems confusing, please write us an email; we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.