.NET text similarity algorithms: the cosine theorem and SimHash — analysis and application

Source: Internet
Author: User

Cosine similarity

// Cosine similarity
//
// Principle: first segment both texts into words and list all distinct words.
// Then count the frequency of each word and turn each text into a term-frequency
// vector; measuring the similarity of the two texts then reduces to measuring
// the similarity of the two vectors.
//
// Example:
//   Text 1: I / love / Beijing / Tiananmen       -> frequency vector [1, 1, 1, 1]
//   Text 2: we / all love / Beijing / Tiananmen  -> frequency vector [1, 0, 1, 2]
//
// Each vector can be seen as a line segment in space starting at the origin
// ([0, 0, ...]) and pointing in some direction. The two segments form an angle:
// 0 degrees means the directions coincide, 90 degrees means they are completely
// different, and 180 degrees means they are opposite. The smaller the angle,
// the more similar the two texts are.
//
// C# core algorithm (reconstructed from the garbled listing; depends on a
// project-local `Tokeniser` class that provides `Partition(string)` and
// `ArrayListToArray(ArrayList)`):

/// <summary>
/// Computes TF-IDF term weights for a set of documents and exposes
/// document-to-document cosine similarity over those weight vectors.
/// </summary>
public class TFIDFMeasure
{
    private string[] _docs;            // the raw input documents
    private string[][] _ngramDoc;      // per-document token arrays (reserved, see GeneratNgramText)
    private int _numDocs = 0;
    private int _numTerms = 0;
    private ArrayList _terms;          // distinct terms across all documents
    private int[][] _termFreq;         // _termFreq[term][doc] = raw frequency
    private float[][] _termWeight;     // _termWeight[term][doc] = TF * IDF
    private int[] _maxTermFreq;        // per-document maximum term frequency (TF normalizer)
    private int[] _docFreq;            // number of documents containing each term

    /// <summary>Static vector helpers used for the cosine-similarity computation.</summary>
    public class TermVector
    {
        /// <summary>
        /// Cosine of the angle between two equal-length vectors:
        /// dot(v1, v2) / (|v1| * |v2|). Returns 0 when either vector is zero.
        /// </summary>
        public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new Exception("DIFFER LENGTH");

            float denom = (VectorLength(vector1) * VectorLength(vector2));
            if (denom == 0F)
                return 0F;     // avoid division by zero for all-zero vectors
            else
                return (InnerProduct(vector1, vector2) / denom);
        }

        /// <summary>Dot product of two equal-length vectors.</summary>
        public static float InnerProduct(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new Exception("DIFFER LENGTH ARE NOT ALLOWED");

            float result = 0F;
            for (int i = 0; i < vector1.Length; i++)
                result += vector1[i] * vector2[i];

            return result;
        }

        /// <summary>Euclidean (L2) norm of a vector.</summary>
        public static float VectorLength(float[] vector)
        {
            float sum = 0.0F;
            for (int i = 0; i < vector.Length; i++)
                sum = sum + (vector[i] * vector[i]);

            return (float)Math.Sqrt(sum);
        }
    }

    // term -> index into _terms / _termFreq / _termWeight
    private IDictionary _wordsIndex = new Hashtable();

    /// <summary>Builds the full TF-IDF weight matrix for <paramref name="documents"/>.</summary>
    public TFIDFMeasure(string[] documents)
    {
        _docs = documents;
        _numDocs = documents.Length;
        MyInit();
    }

    // Intentionally empty in the original article (n-gram generation stub).
    private void GeneratNgramText()
    {
    }

    /// <summary>Collects the distinct terms occurring in any document, in first-seen order.</summary>
    private ArrayList GenerateTerms(string[] docs)
    {
        ArrayList uniques = new ArrayList();
        _ngramDoc = new string[_numDocs][];
        for (int i = 0; i < docs.Length; i++)
        {
            Tokeniser tokenizer = new Tokeniser();
            string[] words = tokenizer.Partition(docs[i]);

            for (int j = 0; j < words.Length; j++)
                if (!uniques.Contains(words[j]))
                    uniques.Add(words[j]);
        }
        return uniques;
    }

    /// <summary>Stores <paramref name="newValue"/> under <paramref name="key"/>, returning the previous value (or null).</summary>
    private static object AddElement(IDictionary collection, object key, object newValue)
    {
        object element = collection[key];
        collection[key] = newValue;
        return element;
    }

    /// <summary>Index of a term in the vocabulary, or -1 when unknown.</summary>
    private int GetTermIndex(string term)
    {
        object index = _wordsIndex[term];
        if (index == null) return -1;
        return (int)index;
    }

    /// <summary>Allocates the frequency/weight matrices and fills them.</summary>
    private void MyInit()
    {
        _terms = GenerateTerms(_docs);
        _numTerms = _terms.Count;

        _maxTermFreq = new int[_numDocs];
        _docFreq = new int[_numTerms];
        _termFreq = new int[_numTerms][];
        _termWeight = new float[_numTerms][];

        for (int i = 0; i < _terms.Count; i++)
        {
            _termWeight[i] = new float[_numDocs];
            _termFreq[i] = new int[_numDocs];

            AddElement(_wordsIndex, _terms[i], i);
        }

        GenerateTermFrequency();
        GenerateTermWeight();
    }

    private float Log(float num)
    {
        return (float)Math.Log(num); // natural log (the original comment said "log2" — the base only rescales weights uniformly)
    }

    /// <summary>Fills _termFreq, _docFreq and _maxTermFreq from the raw documents.</summary>
    private void GenerateTermFrequency()
    {
        for (int i = 0; i < _numDocs; i++)
        {
            string curDoc = _docs[i];
            IDictionary freq = GetWordFrequency(curDoc);
            IDictionaryEnumerator enums = freq.GetEnumerator();
            _maxTermFreq[i] = int.MinValue;
            while (enums.MoveNext())
            {
                string word = (string)enums.Key;
                int wordFreq = (int)enums.Value;

                int termIndex = GetTermIndex(word);

                _termFreq[termIndex][i] = wordFreq;
                _docFreq[termIndex]++;   // each word appears once per document in the freq map

                if (wordFreq > _maxTermFreq[i]) _maxTermFreq[i] = wordFreq;
            }
        }
    }

    /// <summary>Computes the TF-IDF weight for every (term, document) pair.</summary>
    private void GenerateTermWeight()
    {
        for (int i = 0; i < _numTerms; i++)
        {
            for (int j = 0; j < _numDocs; j++)
                _termWeight[i][j] = ComputeTermWeight(i, j);
        }
    }

    /// <summary>Term frequency normalized by the document's maximum term frequency.</summary>
    private float GetTermFrequency(int term, int doc)
    {
        int freq = _termFreq[term][doc];
        int maxFreq = _maxTermFreq[doc];

        return ((float)freq / (float)maxFreq);
    }

    /// <summary>IDF = log(N / df) for the given term.</summary>
    private float GetInverseDocumentFrequency(int term)
    {
        int df = _docFreq[term];
        return Log((float)(_numDocs) / (float)df);
    }

    private float ComputeTermWeight(int term, int doc)
    {
        float tf = GetTermFrequency(term, doc);
        float idf = GetInverseDocumentFrequency(term);
        return tf * idf;
    }

    /// <summary>The TF-IDF weight vector (over all terms) for one document.</summary>
    private float[] GetTermVector(int doc)
    {
        float[] w = new float[_numTerms];
        for (int i = 0; i < _numTerms; i++)
            w[i] = _termWeight[i][doc];

        return w;
    }

    /// <summary>Cosine similarity between the TF-IDF vectors of two documents.</summary>
    public float GetSimilarity(int doc_i, int doc_j)
    {
        float[] vector1 = GetTermVector(doc_i);
        float[] vector2 = GetTermVector(doc_j);

        return TermVector.ComputeCosineSimilarity(vector1, vector2);
    }

    /// <summary>Case-insensitive word -> count map for one document.</summary>
    private IDictionary GetWordFrequency(string input)
    {
        string convertedInput = input.ToLower();

        Tokeniser tokenizer = new Tokeniser();
        string[] words = tokenizer.Partition(convertedInput);
        Array.Sort(words);   // sorted so CountWords can binary-search

        string[] distinctWords = GetDistinctWords(words);

        IDictionary result = new Hashtable();
        for (int i = 0; i < distinctWords.Length; i++)
        {
            object tmp;
            tmp = CountWords(distinctWords[i], words);
            result[distinctWords[i]] = tmp;
        }

        return result;
    }

    /// <summary>Distinct words of <paramref name="input"/>, preserving first-seen order.</summary>
    private string[] GetDistinctWords(string[] input)
    {
        if (input == null)
            return new string[0];
        else
        {
            ArrayList list = new ArrayList();

            for (int i = 0; i < input.Length; i++)
                if (!list.Contains(input[i]))   // N-gram similarity?
                    list.Add(input[i]);

            return Tokeniser.ArrayListToArray(list);
        }
    }

    /// <summary>
    /// Counts occurrences of <paramref name="word"/> in the SORTED array
    /// <paramref name="words"/>: binary-search one hit, rewind to the first
    /// occurrence, then scan forward over the equal run.
    /// </summary>
    private int CountWords(string word, string[] words)
    {
        int itemIdx = Array.BinarySearch(words, word);

        if (itemIdx > 0)
            while (itemIdx > 0 && words[itemIdx].Equals(word))
                itemIdx--;   // rewind to just before the first occurrence

        int count = 0;
        // A negative itemIdx (word not found) skips the loop and returns 0.
        while (itemIdx < words.Length && itemIdx >= 0)
        {
            if (words[itemIdx].Equals(word)) count++;

            itemIdx++;
            if (itemIdx < words.Length)
                if (!words[itemIdx].Equals(word)) break;
        }

        return count;
    }
}

// Disadvantage: a single feature may give an article a very high weight in one
// dimension, and the full vector dimensionality is very large, so this approach
// is not suitable for computations over large data volumes.
//
// SimHash principle: the main idea of the algorithm is dimensionality
// reduction — map a high-dimensional feature vector to an f-bit fingerprint,
// then decide whether two articles are duplicates or highly similar by the
// Hamming distance between their f-bit fingerprints. Because each article's
// fingerprint can be computed and stored in advance, comparison reduces to a
// Hamming-distance computation, which is very fast for big-data workloads.
// Google uses this algorithm for web-page deduplication.
//
// Assume the following three texts:
//   1. the cat sat on the mat
//   2. the cat sat on a mat
//   3. we all scream for ice cream
//
// How is this hash implemented? Taking the texts above as an example, the whole
// process can be divided into six steps:
//   1. Choose the number of SimHash bits (e.g. 32), weighing storage cost
//      against the size of the data set.
//   2. Initialize the SimHash (an array of per-bit counters) to 0.
//   3. Extract features from the original text, generally with some word
//      segmentation; e.g. "the cat sat on the mat" with 2-gram segmentation
//      yields: {"th", "he", "e ", " c", "ca", "at", "t ", " s", "sa", " o",
//      "on", "n ", " t", " m", "ma"}.
//   4. Compute a classic 32-bit hash code for each feature, e.g.
//      "th".Hash = -502157718, "he".Hash = -369049682, ...
//   5. For every bit of each feature's hash code: if the bit is 1, add 1 to the
//      corresponding SimHash counter, otherwise subtract 1.
//   6. Finally, for each of the 32 counters: if it is greater than 0, set the
//      corresponding SimHash bit to 1; otherwise set it to 0.

.NET text similarity algorithms: the cosine theorem and SimHash — analysis and application

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If the content of this page seems confusing, please write us an email; we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.