Note how the code uses a single variable to select which case to run: unigram (Uni), bigram (Bi), or trigram (Tri).
At first I thought of using two nested loops: an outer loop over the words and an inner loop over the lines to count each word. That would be too slow.
So I borrowed the idea from the source code: make one big pass over the input, recording in a dictionary how many times each word appears, while a `find` flag marks whether the current line is labeled 1 or 0.
`wordpos1` effectively records N11 for each word (its occurrences in lines labeled 1). N.1 is the same for every word: it is the total number of lines labeled 1.
`wordneg1` records, for each word, its occurrences in lines that are not labeled 1 (the code stores this value in `N10`). Likewise, N.0 is the same for every word: it is the total number of lines labeled 0.
This approach also taught me that recording all of the raw data directly can be overly complex; recording indirect (intermediate) data can also lead to the final result.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace ConsoleApplication2
{
    /// <summary>
    /// Scores character n-grams from a tab-separated file by chi-square, TF, IDF and TF*IDF
    /// against a binary label, then writes the n-grams ranked by chi-square to a result file.
    /// </summary>
    class Program
    {
        // Input file: the first two tab-separated fields are scanned for n-grams;
        // a third field equal to "1" marks the line as belonging to the positive class.
        const string InputPath = "C:\\users\\v-yinqhe\\desktop\\task\\classify\\tmp2.txt";

        // Written as a regular escaped string so the path contains single backslashes,
        // consistent with InputPath.
        const string OutputPath = "C:\\users\\v-yinqhe\\desktop\\task\\classify\\result1.txt";

        // Row cap for the report: the write loop stops once more than MaxRows rows are written.
        const int MaxRows = 10000;

        // Total occurrences of each n-gram.
        static Dictionary<string, int> word = new Dictionary<string, int>();
        // Occurrences of each n-gram in lines labeled 1 (used as N11).
        static Dictionary<string, int> wordpos1 = new Dictionary<string, int>();
        // Occurrences of each n-gram in lines not labeled 1 (used as N10).
        static Dictionary<string, int> wordneg1 = new Dictionary<string, int>();
        // Chi-square score per n-gram.
        static Dictionary<string, double> word_x = new Dictionary<string, double>();
        static Dictionary<string, double> wordtf = new Dictionary<string, double>();
        static Dictionary<string, double> wordidf = new Dictionary<string, double>();
        static Dictionary<string, double> wordtfidf = new Dictionary<string, double>();

        // Number of lines labeled 1 / not labeled 1.
        static int posfirst = 0;
        static int negfirst = 0;

        /// <summary>
        /// N-gram size selector. The numeric value is the n-gram length minus one.
        /// </summary>
        public enum Gram
        {
            Unigram = 0,
            Bigram = 1,
            Trigram = 2,
        }

        /// <summary>
        /// Reads every line from <paramref name="sr"/>, counts the character n-grams of the
        /// first two tab-separated fields, and fills the static score dictionaries
        /// (chi-square, TF, IDF, TF*IDF) for every n-gram seen.
        /// </summary>
        /// <param name="gram">Which n-gram size to extract.</param>
        /// <param name="sr">Open reader over the input; the caller owns and disposes it.</param>
        static void Create(Gram gram, StreamReader sr)
        {
            string line;
            int gramnum = (int)gram;

            while ((line = sr.ReadLine()) != null)
            {
                // An empty line (e.g. a trailing newline) is not a record.
                if (line.Length == 0)
                {
                    continue;
                }

                string[] take = line.Split('\t');
                bool isPositive = take.Length > 2 && take[2] == "1";
                if (isPositive)
                {
                    posfirst++;
                }
                else
                {
                    negfirst++;
                }

                // Only the first two fields carry text; a line with a single field
                // must not index past the end of the array.
                int fieldCount = Math.Min(2, take.Length);
                for (int k = 0; k < fieldCount; k++)
                {
                    string field = take[k];
                    for (int i = 0; i < field.Length - gramnum; i++)
                    {
                        string temp = field.Substring(i, gramnum + 1);

                        // First sighting: register the n-gram in all three count tables so
                        // the scoring loop below can index each of them without a lookup check.
                        if (!word.ContainsKey(temp))
                        {
                            word.Add(temp, 0);
                            wordpos1.Add(temp, 0);
                            wordneg1.Add(temp, 0);
                        }

                        word[temp]++;
                        if (isPositive)
                        {
                            wordpos1[temp]++;
                        }
                        else
                        {
                            wordneg1[temp]++;
                        }
                    }
                }
            }

            // Nothing was counted, so there is nothing to score (and no maximum to take).
            if (word.Count == 0)
            {
                return;
            }

            // Largest positive-class count over all n-grams; used to normalize TF.
            double maxPositive = wordpos1.Values.Max();

            foreach (KeyValuePair<string, int> item in word)
            {
                double n11 = wordpos1[item.Key];
                double n10 = wordneg1[item.Key];
                double n1 = posfirst;
                double n0 = negfirst;
                double n01 = n1 - n11;
                double n00 = n0 - n10;

                // Chi-square of the 2x2 table. A zero denominator is not guarded:
                // double division then yields NaN or infinity rather than throwing.
                double cross = n11 * n00 - n10 * n01;
                double chi = (n11 + n10 + n01 + n00) * cross * cross
                             / ((n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00));
                word_x.Add(item.Key, chi);

                double tf = n11 / maxPositive;
                wordtf.Add(item.Key, tf);

                double idf = Math.Log(n1 / (n11 + 1));
                wordidf.Add(item.Key, idf);

                double tfidf = tf * idf;
                wordtfidf.Add(item.Key, tfidf);
            }
        }

        /// <summary>
        /// Runs the unigram case: scores the input file and writes the n-grams,
        /// sorted by descending chi-square, to the result file.
        /// </summary>
        static void Main(string[] args)
        {
            // To run the bigram or trigram case instead, pass Gram.Bigram or Gram.Trigram
            // and write to result2.txt or result3.txt. Run one case per process: the
            // static dictionaries are never cleared, so a second Create call would
            // add duplicate keys to the score tables.
            using (StreamReader sr = new StreamReader(InputPath))
            {
                Create(Gram.Unigram, sr);
            }

            // Sort by chi-square, highest first.
            var items = from pair in word_x
                        orderby pair.Value descending
                        select pair;

            // Disposing the writer flushes its buffer; without it the tail of the
            // report may never reach the file.
            using (StreamWriter sw = new StreamWriter(OutputPath))
            {
                sw.WriteLine("string\tchi-square\ttf\tidf\ttfidf\n");

                int n = 0;
                foreach (KeyValuePair<string, double> pair in items)
                {
                    sw.WriteLine("{0}:{1},{2},{3},{4}\n",
                        pair.Key, pair.Value, wordtf[pair.Key], wordidf[pair.Key], wordtfidf[pair.Key]);
                    n++;
                    if (n > MaxRows)
                    {
                        break;
                    }
                }
            }
        }
    }
}
Using chi-square to analyze a word's relationship with a category (class)