RMM Segmentation Algorithm Class

Source: Internet
Author: User
Tags: explode, ord
RMM Segmentation Algorithm Class
  1. RMM segmentation algorithm
  /**
   * Dictionary-based word segmenter that matches words from the END of each
   * two-byte text run towards its start (reverse matching, "RMM").
   *
   * Text is processed byte-wise: a byte below 0x81 is treated as a single-byte
   * character, anything else as the first byte of a two-byte character
   * (GBK-style, as the NotGBK() method name suggests -- TODO confirm the
   * encoding callers actually pass in; UTF-8 input is NOT handled).
   *
   * The dictionary is a text file with one entry per line, fields separated by
   * a single space: word, tag, rank.
   */
  class SplitWord
  {
      /** @var array word => tag (second dictionary field) */
      public $TagDic = array();
      /** @var array byte length => (word => rank); used for O(1) word lookups */
      public $RankDic = array();
      /** @var string normalised source text, see UpdateStr() */
      public $SourceStr = '';
      /** @var string accumulated segmentation result */
      public $ResultStr = '';
      /** @var string separator inserted between tokens */
      public $SplitChar = ' ';
      /** @var int two-byte runs of at most this many bytes are not segmented */
      public $SplitLen = 4;
      /** @var int largest byte INDEX of a dictionary word (7 => 8 bytes) */
      public $MaxLen = 7;
      /** @var int largest byte INDEX of the shortest word (3 => 4 bytes) */
      public $MinLen = 3;
      /** @var resource|null optional handle released by Clear(); never opened by this class */
      public $QuickDic = null;

      /**
       * PHP 4 style constructor kept for backward compatibility; it simply
       * forwards to __construct().
       *
       * @param string|null $dicfile see __construct()
       */
      public function SplitWord($dicfile = null)
      {
          $this->__construct($dicfile);
      }

      /**
       * Pre-loads the dictionary so that later lookups are plain array accesses.
       *
       * @param string|null $dicfile dictionary path; defaults to "ppldic.csv"
       *                             in the directory of this source file
       */
      public function __construct($dicfile = null)
      {
          if ($dicfile === null) {
              $dicfile = dirname(__FILE__) . "/ppldic.csv";
          }

          // The original never checked fopen(); a missing file then produced a
          // cascade of warnings (PHP 5/7) or a TypeError (PHP 8). Keep the
          // "continue with an empty dictionary" behaviour, but report it once.
          $fp = is_readable($dicfile) ? fopen($dicfile, 'r') : false;
          if ($fp === false) {
              trigger_error("SplitWord: cannot open dictionary file '" . $dicfile . "'", E_USER_WARNING);
              return;
          }

          while (($line = fgets($fp, 256)) !== false) {
              // Drop the line terminator so it does not end up in the last field.
              $ws = explode(' ', rtrim($line, "\r\n"));
              $word = $ws[0];
              if ($word === '') {
                  continue;
              }
              $this->TagDic[$word] = isset($ws[1]) ? $ws[1] : '';
              // Default the rank to 0 rather than null: IsWord() relies on
              // isset(), which reports false for null values.
              $this->RankDic[strlen($word)][$word] = isset($ws[2]) ? $ws[2] : 0;
          }
          fclose($fp);
      }

      /**
       * Releases the optional QuickDic handle if one was attached.
       *
       * The original called fclose() unconditionally on an undeclared
       * property, which throws a TypeError on PHP 8 (the @ operator does not
       * suppress exceptions).
       */
      public function Clear()
      {
          if (is_resource($this->QuickDic)) {
              fclose($this->QuickDic);
          }
          $this->QuickDic = null;
      }

      /**
       * Sets the text to segment and resets the result buffer.
       *
       * @param string $str raw input text
       */
      public function SetSource($str)
      {
          $this->SourceStr = $this->UpdateStr($str);
          $this->ResultStr = "";
      }

      /**
       * Tells whether a token starts with a single-byte character.
       *
       * @param string $str token to inspect
       * @return bool|string "" for an empty token, false when the first byte
       *                     is above 0x80, true otherwise
       */
      public function NotGBK($str)
      {
          if ($str == "") {
              return "";
          }
          return !(ord($str[0]) > 0x80);
      }

      /**
       * Segments the given text (or the text set via SetSource()).
       *
       * Tokens are visited from last to first and prepended to the result, so
       * the output keeps the input order. Each token is followed by SplitChar.
       *
       * @param string $str text to segment; "" re-uses the current source
       * @return string the segmentation result, "" when there is no source
       */
      public function SplitRMM($str = "")
      {
          if ($str != "") {
              $this->SetSource($str);
          }
          if ($this->SourceStr == "") {
              return "";
          }

          // Start from a clean buffer so that calling SplitRMM() twice without
          // an argument does not append the second result to the first.
          $this->ResultStr = "";
          $this->SourceStr = $this->UpdateStr($this->SourceStr);

          $spc = $this->SplitChar;
          $spwords = explode(" ", $this->SourceStr);

          for ($i = count($spwords) - 1; $i >= 0; $i--) {
              $word = $spwords[$i];
              if ($word == "") {
                  continue;
              }

              if ($this->NotGBK($word)) {
                  // Single-byte token: kept only when it contains something
                  // other than digits, '.', '+' or '-'.
                  // NOTE(review): purely numeric tokens are dropped, exactly as
                  // in the original listing (whose numeric branch only computed
                  // an unused local) -- confirm this is intended.
                  if (preg_match("/[^0-9\.\+\-]/", $word)) {
                      $this->ResultStr = $word . $spc . $this->ResultStr;
                  }
              } elseif (strlen($word) > $this->SplitLen) {
                  $this->ResultStr = $this->RunRMM($word) . $spc . $this->ResultStr;
              }
              // NOTE(review): two-byte tokens of at most SplitLen bytes are
              // dropped; the original had an empty block here -- confirm.
          }

          return $this->ResultStr;
      }

      /**
       * Splits a run of two-byte characters by matching dictionary words from
       * the end of the string towards its start.
       *
       * @param string $str run of two-byte characters
       * @return string the words formatted by otherword()
       */
      public function RunRMM($str)
      {
          $WordArray = array();

          for ($i = strlen($str) - 1; $i >= 0;) {
              // Only the shortest possible word (or less) is left: emit the
              // remaining prefix and stop. With the default MinLen this is the
              // original behaviour for well-formed input ($i == 1 or $i == 3).
              if ($i <= $this->MinLen) {
                  $head = substr($str, 0, $i + 1);
                  if ($i <= 1 || $this->IsWord($head)) {
                      $WordArray[] = $head;
                  } else {
                      $WordArray[] = substr($head, 2);
                      $WordArray[] = substr($head, 0, 2);
                  }
                  break;
              }

              // Try the longest candidate ending at byte $i first, shrinking
              // by one two-byte character per step.
              $maxPos = ($i >= $this->MaxLen) ? $this->MaxLen : $i;
              $isMatch = false;
              for ($j = $maxPos; $j >= 0; $j -= 2) {
                  $w = substr($str, $i - $j, $j + 1);
                  if ($this->IsWord($w)) {
                      $WordArray[] = $w;
                      $i = $i - $j - 1;
                      $isMatch = true;
                      break;
                  }
              }

              // Nothing matched, not even the single last character. The
              // original left $i untouched here and looped forever; emit the
              // last two bytes as a word of their own and move on.
              if (!$isMatch) {
                  $WordArray[] = substr($str, $i - 1, 2);
                  $i -= 2;
              }
          }

          return $this->otherword($WordArray);
      }

      /**
       * Formats the words collected by RunRMM().
       *
       * The array is in reverse text order, so it is walked backwards. Every
       * word is emitted as SplitChar + word + ","; a leading SplitChar is then
       * replaced by ",".
       *
       * @param array $WordArray words in reverse text order
       * @return string e.g. ",w1, w2," for array('w2', 'w1') and SplitChar ' '
       */
      public function otherword($WordArray)
      {
          $spc = $this->SplitChar;
          $rsStr = "";
          for ($i = count($WordArray) - 1; $i >= 0; $i--) {
              $rsStr .= $spc . $WordArray[$i] . ",";
          }
          // Quote the separator: it is data, not a pattern (e.g. '|' or '.'
          // would otherwise change the meaning of the expression).
          return preg_replace("/^" . preg_quote($spc, "/") . "/", ",", $rsStr);
      }

      /**
       * Tells whether a word is present in the dictionary.
       *
       * @param string $okWord candidate word
       * @return bool
       */
      public function IsWord($okWord)
      {
          $slen = strlen($okWord);
          // MaxLen is a byte index, so the longest word has MaxLen + 1 bytes.
          // The original compared the length against MaxLen itself and thus
          // rejected every word of maximum length.
          if ($slen > $this->MaxLen + 1) {
              return false;
          }
          return isset($this->RankDic[$slen][$okWord]);
      }

      /**
       * Normalises text before segmentation: inserts SplitChar at every
       * boundary between character classes and drops control characters.
       *
       * Character classes tracked in $prechar:
       *   0 = blank, 1 = single-byte word character, 2 = two-byte character,
       *   3 = symbol.
       *
       * @param string $str raw text
       * @return string text with SplitChar inserted between class changes
       */
      public function UpdateStr($str)
      {
          $str = (string) $str;
          $spc = $this->SplitChar;
          $slen = strlen($str);
          if ($slen == 0) {
              return '';
          }

          $okstr = '';
          $prechar = 0;
          for ($i = 0; $i < $slen; $i++) {
              $ch = $str[$i];
              $code = ord($ch);

              if ($code < 0x81) {
                  if ($code < 33) {
                      // Blank / control character: becomes one separator,
                      // except CR and LF which are dropped silently.
                      if ($prechar != 0 && $ch != "\r" && $ch != "\n") {
                          $okstr .= $spc;
                      }
                      $prechar = 0;
                  } elseif (preg_match('/[^0-9a-zA-Z@.%#:&_-]/', $ch)) {
                      // Single-byte symbol: always a token of its own.
                      $okstr .= ($prechar == 0) ? $ch : $spc . $ch;
                      $prechar = 3;
                  } else {
                      // Single-byte word character. The original additionally
                      // tested the character against "/@#%:/", which can never
                      // match a one-character subject; that dead branch is
                      // omitted without changing behaviour.
                      if ($prechar == 2 || $prechar == 3) {
                          $okstr .= $spc;
                      }
                      $okstr .= $ch;
                      $prechar = 1;
                  }
                  continue;
              }

              // Two-byte character. Separate it from a preceding single-byte
              // word character or symbol.
              if ($prechar != 0 && $prechar != 2) {
                  $okstr .= $spc;
              }
              // A trailing lead byte without a second byte is dropped.
              if (isset($str[$i + 1])) {
                  $c = $ch . $str[$i + 1];
                  $n = hexdec(bin2hex($c));
                  // Codes strictly between 0xA13F and 0xAA40 are classified as
                  // symbols. The original tested "< 0xA13F && > 0xAA40", which
                  // is never true, so this branch was unreachable.
                  if ($n > 0xA13F && $n < 0xAA40) {
                      $okstr .= ($prechar != 0) ? $spc . $c : $c;
                      $prechar = 3;
                  } else {
                      $okstr .= $c;
                      $prechar = 2;
                  }
                  $i++; // skip the second byte
              }
          }

          return $okstr;
      }
  }
  // Usage example: build the segmenter (this loads ppldic.csv from the
  // directory of the class file) and print the segmented text.
  $split = new SplitWord();
  echo $split->SplitRMM("PHP Search Technology");
  // Note: the format of the ppldic.csv dictionary is word + space + number + n
Copy Code
  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.