RMM word segmentation algorithm
- // RMM word segmentation algorithm
/**
 * Reverse Maximum Matching (RMM) word segmenter for double-byte (GBK-style)
 * Chinese text mixed with single-byte ASCII text.
 *
 * Typical use:
 *     $split = new SplitWord();
 *     echo $split->SplitRMM($text);
 *
 * The dictionary is loaded once, in the constructor, from "ppldic.csv" in the
 * same directory as this file. Each line is split on single spaces and must
 * provide at least three fields: the word, a second field stored in $TagDic,
 * and a third field stored in $RankDic.
 */
class SplitWord
{
    /** @var array Maps word => second dictionary field. */
    public $TagDic = array();

    /** @var array Maps byte length => (word => third dictionary field); used for lookups. */
    public $RankDic = array();

    /** @var string Normalised text waiting to be segmented. */
    public $SourceStr = '';

    /** @var string Segmentation result built by SplitRMM(). */
    public $ResultStr = '';

    /** @var string Separator placed between tokens. */
    public $SplitChar = ' ';

    /** @var int Double-byte tokens of at most this many bytes are skipped by SplitRMM(). */
    public $SplitLen = 4;

    /** @var int Largest byte INDEX of a dictionary word (longest word is MaxLen + 1 bytes). */
    public $MaxLen = 7;

    /** @var int Largest byte INDEX of the shortest multi-character word (MinLen + 1 bytes). */
    public $MinLen = 3;

    /**
     * Loads the dictionary into memory so that lookups are plain array probes.
     *
     * A missing or unreadable dictionary raises an E_USER_WARNING and leaves
     * the dictionaries empty instead of passing an invalid handle to
     * fgets()/fclose(). Lines with fewer than three fields are skipped.
     */
    public function __construct()
    {
        $dicFile = dirname(__FILE__) . "/ppldic.csv";
        if (!is_readable($dicFile)) {
            trigger_error("SplitWord: dictionary not readable: " . $dicFile, E_USER_WARNING);
            return;
        }
        $fp = fopen($dicFile, 'r');
        if ($fp === false) {
            return;
        }
        // Compare against false explicitly so a falsy-looking line cannot end the loop early.
        while (($line = fgets($fp, 256)) !== false) {
            // Strip the line terminator so it does not end up inside the last field.
            $ws = explode(' ', rtrim($line, "\r\n"));
            if (count($ws) < 3 || $ws[0] === '') {
                continue;
            }
            $this->TagDic[$ws[0]] = $ws[1];
            $this->RankDic[strlen($ws[0])][$ws[0]] = $ws[2];
        }
        fclose($fp);
    }

    /**
     * PHP 4 style constructor name, kept as a plain method so existing callers
     * that invoke it explicitly keep working. It simply reloads the dictionary.
     */
    public function SplitWord()
    {
        $this->__construct();
    }

    /**
     * Releases resources held by the instance.
     *
     * Closes the handle stored in the QuickDic property when one is present.
     * No code in this class assigns that property, so the call is guarded
     * rather than passing a non-resource to fclose().
     */
    public function Clear()
    {
        if (isset($this->QuickDic) && is_resource($this->QuickDic)) {
            fclose($this->QuickDic);
        }
    }

    /**
     * Sets the text to segment and resets the previous result.
     *
     * @param string $str Raw input text.
     */
    public function SetSource($str)
    {
        $this->SourceStr = $this->UpdateStr($str);
        $this->ResultStr = "";
    }

    /**
     * Tells whether a token starts with a single-byte (non-GBK) character.
     *
     * @param string $str Token to test.
     * @return bool|string "" for an empty token, false when the first byte is
     *                     above 0x80, true otherwise.
     */
    public function NotGBK($str)
    {
        if ($str == "") {
            return "";
        }
        return !(ord($str[0]) > 0x80);
    }

    /**
     * Segments text with the RMM algorithm.
     *
     * The normalised source is split on single spaces and walked from the last
     * token to the first. Single-byte tokens are kept unless they consist only
     * of digits, ".", "+" and "-". Double-byte tokens longer than $SplitLen
     * bytes are segmented by RunRMM(); shorter ones are skipped.
     *
     * @param string $str Text to segment; when empty, the text previously
     *                    given to SetSource() is used.
     * @return string Tokens, each followed by $SplitChar, or "" when there is
     *                nothing to segment.
     */
    public function SplitRMM($str = "")
    {
        if ($str != "") {
            $this->SetSource($str);
        }
        if ($this->SourceStr == "") {
            return "";
        }
        $this->SourceStr = $this->UpdateStr($this->SourceStr);
        // Start from a clean result so repeated calls do not accumulate output.
        $this->ResultStr = "";
        $spc = $this->SplitChar;
        $spwords = explode(" ", $this->SourceStr);
        for ($i = count($spwords) - 1; $i >= 0; $i--) {
            $word = $spwords[$i];
            if ($word == "") {
                continue;
            }
            if ($this->NotGBK($word)) {
                // Purely numeric tokens are not emitted.
                if (preg_match('/[^0-9\.\+\-]/', $word)) {
                    $this->ResultStr = $word . $spc . $this->ResultStr;
                }
            } elseif (strlen($word) > $this->SplitLen) {
                $this->ResultStr = $this->RunRMM($word) . $spc . $this->ResultStr;
            }
        }
        return $this->ResultStr;
    }

    /**
     * Segments an all-double-byte string by reverse dictionary matching.
     *
     * Scans from the end of the string, each time taking the longest
     * dictionary word that ends at the current position. When nothing matches,
     * the trailing two-byte character is emitted on its own so the scan always
     * makes progress.
     *
     * @param string $str Double-byte string.
     * @return string Words in reading order, as formatted by otherword().
     */
    public function RunRMM($str)
    {
        $wordArray = array();
        for ($i = strlen($str) - 1; $i >= 0;) {
            // Only the shortest possible word (or less) remains at the front.
            if ($i <= $this->MinLen) {
                if ($i == 1) {
                    $wordArray[] = substr($str, 0, 2);
                } else {
                    $w = substr($str, 0, $this->MinLen + 1);
                    if ($this->IsWord($w)) {
                        $wordArray[] = $w;
                    } else {
                        $wordArray[] = substr($str, 2, 2);
                        $wordArray[] = substr($str, 0, 2);
                    }
                }
                break;
            }
            // Longest candidate first, shrinking by one two-byte character per step.
            $maxPos = ($i >= $this->MaxLen) ? $this->MaxLen : $i;
            $isMatch = false;
            for ($j = $maxPos; $j >= 0; $j -= 2) {
                $w = substr($str, $i - $j, $j + 1);
                if ($this->IsWord($w)) {
                    $wordArray[] = $w;
                    $i = $i - $j - 1;
                    $isMatch = true;
                    break;
                }
            }
            if (!$isMatch) {
                // No dictionary entry ends here: emit the last character alone.
                // Without this step $i would never change and the loop would not end.
                $wordArray[] = substr($str, $i - 1, 2);
                $i -= 2;
            }
        }
        return $this->otherword($wordArray);
    }

    /**
     * Joins the words collected by RunRMM() (stored last-to-first) back into
     * reading order, each word followed by a comma and preceded by $SplitChar.
     *
     * @param array $WordArray Words in reverse reading order.
     * @return string Joined words without the leading separator.
     */
    public function otherword($WordArray)
    {
        $spc = $this->SplitChar;
        $rsStr = "";
        for ($i = count($WordArray) - 1; $i >= 0; $i--) {
            $rsStr .= $spc . $WordArray[$i] . ",";
        }
        // NOTE(review): the replacement argument was missing in the source
        // listing; stripping the leading separator is assumed - confirm.
        return preg_replace("/^" . preg_quote($spc, "/") . "/", "", $rsStr);
    }

    /**
     * Tells whether a word exists in the dictionary.
     *
     * @param string $okWord Candidate word.
     * @return bool True when the word is a known dictionary entry.
     */
    public function IsWord($okWord)
    {
        $slen = strlen($okWord);
        // $MaxLen is a byte index, so the longest valid word is MaxLen + 1 bytes.
        if ($slen > $this->MaxLen + 1) {
            return false;
        }
        return isset($this->RankDic[$slen][$okWord]);
    }

    /**
     * Normalises a string before segmentation: inserts $SplitChar between runs
     * of different character classes (ASCII word characters, ASCII symbols,
     * double-byte symbols, other double-byte characters) and collapses
     * whitespace.
     *
     * @param string $str Raw text.
     * @return string Text with class boundaries marked by $SplitChar.
     */
    public function UpdateStr($str)
    {
        $spc = $this->SplitChar;
        $slen = strlen($str);
        if ($slen == 0) {
            return '';
        }
        $okstr = '';
        $prechar = 0; // class of the previous character: 0 blank, 1 ASCII word, 2 Chinese, 3 symbol
        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];
            if (ord($ch) < 0x81) {
                if (ord($ch) < 33) {
                    // Whitespace / control byte: becomes one separator, except CR and LF.
                    if ($prechar != 0 && $ch != "\r" && $ch != "\n") {
                        $okstr .= $spc;
                    }
                    $prechar = 0;
                } elseif (preg_match('/[^0-9a-zA-Z@\.%#:\&_-]/', $ch)) {
                    // ASCII symbol: always starts a token of its own.
                    $okstr .= ($prechar == 0 ? '' : $spc) . $ch;
                    $prechar = 3;
                } else {
                    // ASCII word character. The original also tested the single
                    // character against /@#%:/, which can never match one
                    // character, so that dead branch is omitted.
                    if ($prechar == 2 || $prechar == 3) {
                        $okstr .= $spc;
                    }
                    $okstr .= $ch;
                    $prechar = 1;
                }
                continue;
            }
            if (!isset($str[$i + 1])) {
                // Lone lead byte at the end of the input: it is dropped.
                if ($prechar != 0 && $prechar != 2) {
                    $okstr .= $spc;
                }
                continue;
            }
            $c = $ch . $str[$i + 1];
            $n = (ord($ch) << 8) | ord($str[$i + 1]);
            // Double-byte codes in 0xA140..0xAA3F are treated as symbols. The
            // original condition had both comparisons inverted and was never true.
            $isSymbol = ($n > 0xA13F && $n < 0xAA40);
            if ($prechar != 0 && ($isSymbol || $prechar != 2)) {
                $okstr .= $spc;
            }
            $okstr .= $c;
            $prechar = $isSymbol ? 3 : 2;
            $i++; // the second byte of the pair has been consumed
        }
        return $okstr;
    }
}
// Usage example: build the segmenter (this loads ppldic.csv) and print the
// segmentation of a sample string.
$split = new SplitWord();
echo $split->SplitRMM("php search technology ");
// Note: each line of the ppldic.csv dictionary holds space-separated fields
// and ends with a newline; the loader reads the word first, followed by two
// further fields.
|