Declarations:
Declarations of the functions that select feature words per category
Vector < Pair < String , Double > > Localdffeatureselectionforperclass (Dictionary & Mymap, contingency & Contingencytable, String Classlabel ); // The local DF method sorts each word in each category.
Void Dffeatureselection (vector < String > Classlabels, Dictionary & Mymap, contingency & Contingencytable, Int N, Char * Address ); // Call the local DF Feature Word Selection Function
Function implementation:
For each word in the dictionary, count how many times it appears in the given category, then sort the words by that frequency in descending order.
/* **************************************** ****************************** */
/* Per-category (local) DF feature word selection */
/* **************************************** ****************************** */
Vector < Pair < String , Double > > Preprocess: localdffeatureselectionforperclass (Dictionary & Mymap, contingency & Contingencytable, String Classlabel)
{
// Int finalkeywordscount = 0; // Calculate the total number of keywords
Clock_t start, finish;
Double Totaltime;
Start = Clock ();
Vector < Pair < String , Double > > Dfinfo;
For (Map < String , Vector < Pair < Int , Int >>> : Iterator it = Mymap. Begin (); it ! = Mymap. End (); ++ It)
{
Pair < String , String > Compoundkey = Make_pair (it -> First, classlabel );
Double Classcount = ( Double ) Contingencytable [compoundkey]. first;
Dfinfo. push_back (make_pair (it -> First, classcount ));
}
Stable_sort (dfinfo. Begin (), dfinfo. End (), islarger );
Finish = Clock ();
Totaltime = ( Double ) (Finish - Start) / Clocks_per_sec;
Cout < " Category " < Classlabel < " The selected feature words are shared. " < Totaltime < Endl;
ReturnDfinfo;
}
DF Feature Word selection method:
Code
/* **************************************** ****************************** */
/* DF Feature Word Selection Method */
/* **************************************** ****************************** */
Void Preprocess: dffeatureselection (vector < String > Classlabels, Dictionary & Mymap, contingency & Contingencytable, Int N, Char * Address)
{
Clock_t start, finish;
Double Totaltime;
Int Totaltraingingcorpus = Endindex - Beginindex + 1 ; // Training corpusArticleQuantity
Set < String > Finalkeywords; // Store finally selected feature words
Vector < Pair < String , Double > Dfinfo;
Start = Clock ();
For (Vector < String > : Iterator it = Classlabels. Begin (); it ! = Classlabels. End (); it ++ )
{
// Number of articles in a certain category in the training corpus
Int N_subclasscnt = Getcategorizationnum ( * It, " Trainingcorpus " );
// Threshold determines how many feature words are selected for each category
Int Threshold = N_subclasscnt * N / Totaltraingingcorpus;
Dfinfo = Localdffeatureselectionforperclass (mymap, contingencytable, * It );
For (Vector < Pair < String , Double > > : Size_type J = 0 ; J < Threshold; j ++ )
{
Finalkeywords. insert (dfinfo [J]. First );
}
Dfinfo. Clear ();
}
Ofstream OUTFILE (Address );
Int Finalkeywordscount = Finalkeywords. Size ();
For ( Set < String > : Iterator it = Finalkeywords. Begin (); it ! = Finalkeywords. End (); it ++ )
{
OUTFILE <* It < Endl;
}
OUTFILE. Close ();
Cout < " Finally, a total of feature words are selected. " < Finalkeywordscount < Endl;
Finish = Clock ();
Totaltime = ( Double ) (Finish - Start) / Clocks_per_sec;
Cout < " A total of feature words are selected. " < Totaltime < Endl;
}
Main function call:
Code
// Load the dictionary and the contingency table, then select up to 2000 feature
// words and write them to keywords.dat.
// NOTE(review): the path literals were restored from a listing that had spaces
// injected and single backslashes; confirm the actual file locations.
P.loaddictionary(mymap, "F:\\finallyliuyu\\dict.dat");
P.loadcontingencytable(contingenytable, "F:\\finallyliuyu\\contingency.dat");
P.dffeatureselection(labels, mymap, contingenytable, 2000, "F:\\finallyliuyu\\keywords.dat");