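Input example: s1 = "计算语言学课程有意思" ("The computational linguistics course is interesting"; the algorithm works on the individual Chinese characters of this sentence);
Definitions: maximum word length maxlen = 5 (characters); output string s2 = ""; delimiter = "/";
Suppose the dictionary contains: ..., 计算语言学 (computational linguistics), 课程 (course), 意思 (meaning), ...;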
The reverse maximum matching segmentation algorithm proceeds as follows:
(1) s2 = ""; s1 is not empty, so take the candidate substring of maxlen characters from the right end of s1: w = "课程有意思";
(2) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "程有意思";
(3) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "有意思";
(4) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "意思";
(5) Look up the dictionary: "意思" (meaning) is in it, so add w to s2, giving s2 = "意思/", and remove w from s1; now s1 = "计算语言学课程有";
(6) s1 is not empty, so take the candidate substring from the right end of s1: w = "言学课程有";
(7) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "学课程有";
(8) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "课程有";
(9) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "程有";
(10) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "有" (have); w is now a single character, so it is taken as a word: add w to s2, giving s2 = "有/意思/", and remove w from s1; now s1 = "计算语言学课程";
(11) s1 is not empty, so take the candidate substring from the right end of s1: w = "语言学课程";
(12) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "言学课程";
(13) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "学课程";
(14) Look up the dictionary: w is not in it, so remove the leftmost character of w, giving w = "课程";
(15) Look up the dictionary: "课程" (course) is in it, so add w to s2, giving s2 = "课程/有/意思/", and remove w from s1; now s1 = "计算语言学";
(16) s1 is not empty, so take the candidate substring from the right end of s1: w = "计算语言学";
(17) Look up the dictionary: "计算语言学" (computational linguistics) is in it, so add w to s2, giving s2 = "计算语言学/课程/有/意思/", and remove w from s1; now s1 = "";
(18) s1 is empty, so output s2 as the segmentation result; the segmentation process ends.
// Word-segmentation program: defines the entry point of the console application.
//
// Segments each line of a text file into words using reverse (right-to-left)
// maximum matching against a dictionary, and writes the words separated by "/".

// #include "stdafx.h"  // MSVC precompiled header; re-enable if the project builds with one.
#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

namespace {

// Longest dictionary word that is ever tried, in characters.
// (The earlier byte-based code used a 10-byte window with 2 bytes per character.)
constexpr std::size_t kMaxWordChars = 5;

/// Reads the dictionary: the second whitespace-separated field of every line is
/// taken as a word. Lines that have no second field are skipped.
std::set<std::string> loadDictionary(std::istream& in)
{
    std::set<std::string> words;
    std::string line;
    while (std::getline(in, line)) {
        std::istringstream fields(line);
        std::string leadingField;
        std::string word;
        if (fields >> leadingField >> word) {
            words.insert(word);
        }
    }
    return words;
}

/// Returns the byte offset at which each character of `line` starts, followed by
/// `line.size()` as a sentinel, so character k occupies [result[k], result[k+1]).
///
/// NOTE(review): a byte >= 0x80 is treated as the first byte of a two-byte
/// character and anything else as a one-byte character. This presumes the text
/// is in a double-byte encoding such as GBK -- confirm against the input files.
std::vector<std::size_t> characterBoundaries(const std::string& line)
{
    std::vector<std::size_t> bounds;
    std::size_t pos = 0;
    while (pos < line.size()) {
        bounds.push_back(pos);
        const bool isLeadByte = static_cast<unsigned char>(line[pos]) >= 0x80;
        // A lead byte at the very end of the line has no partner; keep it as one byte.
        pos += (isLeadByte && pos + 1 < line.size()) ? 2 : 1;
    }
    bounds.push_back(line.size());
    return bounds;
}

/// Segments one line by reverse maximum matching.
///
/// Starting from the right end, the longest candidate (at most kMaxWordChars
/// characters) is looked up in `dict`; on a miss the leftmost character is
/// dropped and the lookup repeats. A single remaining character is always
/// accepted, so every iteration consumes at least one character.
///
/// @return the words in reading (left-to-right) order; empty for an empty line.
std::vector<std::string> segmentLine(const std::string& line,
                                     const std::set<std::string>& dict)
{
    const std::vector<std::size_t> bounds = characterBoundaries(line);
    std::vector<std::string> tokens;

    std::size_t end = bounds.size() - 1;  // characters not yet segmented
    while (end > 0) {
        std::size_t start = end > kMaxWordChars ? end - kMaxWordChars : 0;
        for (;; ++start) {
            const std::string candidate =
                line.substr(bounds[start], bounds[end] - bounds[start]);
            const bool isSingleCharacter = (start + 1 == end);
            if (isSingleCharacter || dict.count(candidate) != 0) {
                tokens.push_back(candidate);
                break;
            }
        }
        end = start;  // continue with the text to the left of the accepted word
    }

    // Words were discovered right-to-left; restore the original text order.
    std::reverse(tokens.begin(), tokens.end());
    return tokens;
}

}  // namespace

/// Loads the dictionary, segments the test text line by line and writes one
/// output line per input line, each word followed by "/".
///
/// @return 0 on success, EXIT_FAILURE if any of the three files cannot be opened.
int main()
{
    std::ifstream dic("D:\\file\\word_freq_list.txt");   // dictionary
    std::ifstream test("D:\\file\\pku_test.txt");        // text to segment
    std::ofstream out("D:\\file\\word_output.txt");      // segmented result
    if (!dic || !test || !out) {
        std::cerr << "Open file failed!";
        return EXIT_FAILURE;
    }

    const std::set<std::string> dicset = loadDictionary(dic);

    std::string testline;
    while (std::getline(test, testline)) {
        for (const std::string& token : segmentLine(testline, dicset)) {
            out << token << "/";
        }
        out << '\n';
    }

    // The streams are closed by their destructors.
    return 0;
}
Reverse maximum matching word segmentation algorithm for natural language processing