RMM Segmentation Algorithm Class
- RMM segmentation algorithm
/**
 * Reverse Maximum Matching (RMM) word segmenter for GBK-encoded text.
 *
 * The input is first normalised by UpdateStr() into runs separated by
 * $SplitChar; runs that start with a high byte are then decomposed by
 * RunRMM(), which scans from the end of the run towards its start and looks
 * up the longest candidate in the dictionary at every step.
 *
 * All lengths and offsets are in BYTES; the code assumes two bytes per
 * Chinese character (GBK).
 */
class SplitWord
{
    /** @var array word => second dictionary field (tag) */
    public $TagDic = array();
    /** @var array byte length => (word => third dictionary field) */
    public $RankDic = array();
    /** @var string normalised source string */
    public $SourceStr = '';
    /** @var string segmentation result of the last SplitRMM() call */
    public $ResultStr = '';
    /** @var string separator placed between tokens */
    public $SplitChar = ' ';
    /** @var int Chinese runs of at most this many bytes are kept unsplit */
    public $SplitLen = 4;
    /** @var int largest byte INDEX of a dictionary word (byte length - 1) */
    public $MaxLen = 7;
    /** @var int largest byte INDEX of the shortest word (byte length - 1) */
    public $MinLen = 3;

    /**
     * PHP 4 style constructor name, kept so existing callers keep working.
     *
     * @param string|null $dicfile forwarded to __construct()
     */
    function SplitWord($dicfile = null)
    {
        $this->__construct($dicfile);
    }

    /**
     * Pre-loads the dictionary into memory so lookups are fast.
     *
     * Each dictionary line is split on single spaces: field 0 is the word,
     * field 1 is stored in $TagDic and field 2 in $RankDic. Fields 1 and 2
     * are optional; a missing field is stored as an empty string (not null)
     * so that isset() in IsWord() still recognises the word.
     *
     * @param string|null $dicfile dictionary path; defaults to ppldic.csv
     *                             next to this source file
     */
    function __construct($dicfile = null)
    {
        if ($dicfile === null) {
            $dicfile = dirname(__FILE__) . "/ppldic.csv";
        }

        $fp = @fopen($dicfile, 'r');
        if ($fp === false) {
            // Leave the dictionaries empty instead of looping on an invalid handle.
            trigger_error("SplitWord: cannot open dictionary file '" . $dicfile . "'", E_USER_WARNING);
            return;
        }

        // Compare against false explicitly: a line such as "0" is falsy.
        while (($line = fgets($fp, 256)) !== false) {
            $ws = explode(' ', rtrim($line, "\r\n"));
            if ($ws[0] === '') {
                continue; // blank line
            }
            $this->TagDic[$ws[0]] = isset($ws[1]) ? $ws[1] : '';
            $this->RankDic[strlen($ws[0])][$ws[0]] = isset($ws[2]) ? $ws[2] : '';
        }
        fclose($fp);
    }

    /**
     * Releases resources.
     *
     * Closes $this->QuickDic only when it is actually an open resource; this
     * class never assigns that property itself.
     */
    function Clear()
    {
        if (isset($this->QuickDic) && is_resource($this->QuickDic)) {
            fclose($this->QuickDic);
        }
    }

    /**
     * Sets the source string (normalised through UpdateStr()) and clears the
     * previous result.
     *
     * @param string $str
     */
    function SetSource($str)
    {
        $this->SourceStr = $this->UpdateStr($str);
        $this->ResultStr = "";
    }

    /**
     * Tells whether a token does NOT start with a high (multi-byte) byte.
     *
     * @param string $str
     * @return bool|string "" for an empty string, false when the first byte
     *                     is above 0x80, true otherwise
     */
    function NotGBK($str)
    {
        if ($str == "") {
            return "";
        }
        return !(ord($str[0]) > 0x80);
    }

    /**
     * Segments a string with the RMM algorithm.
     *
     * @param string $str text to segment; when empty, the string previously
     *                    given to SetSource() is used
     * @return string tokens, each followed by $SplitChar; "" when there is
     *                nothing to segment
     */
    function SplitRMM($str = "")
    {
        if ($str != "") {
            $this->SetSource($str);
        }
        if ($this->SourceStr == "") {
            return "";
        }

        $this->SourceStr = $this->UpdateStr($this->SourceStr);
        // Start from a clean result so repeated calls do not accumulate output.
        $this->ResultStr = "";

        $spc = $this->SplitChar;
        // UpdateStr() joins runs with $SplitChar, so split on the same string.
        $spwords = explode($spc, $this->SourceStr);

        // Walk the runs back to front, prepending each piece to the result.
        for ($i = count($spwords) - 1; $i >= 0; $i--) {
            $word = $spwords[$i];
            if ($word === "") {
                continue;
            }

            if ($this->NotGBK($word) || strlen($word) <= $this->SplitLen) {
                // Non-Chinese tokens (numbers included) and short Chinese
                // runs are kept as they are rather than being dropped.
                $this->ResultStr = $word . $spc . $this->ResultStr;
            } else {
                $this->ResultStr = $this->RunRMM($word) . $spc . $this->ResultStr;
            }
        }

        return $this->ResultStr;
    }

    /**
     * Decomposes an all-Chinese run by reverse dictionary matching.
     *
     * @param string $str run of two-byte characters
     * @return string the words formatted by otherword()
     */
    function RunRMM($str)
    {
        $WordArray = array();
        $i = strlen($str) - 1; // index of the last byte not yet consumed

        while ($i >= 0) {
            // Only the shortest possible word (or less) is left.
            if ($i <= $this->MinLen) {
                $head = substr($str, 0, $i + 1);
                if ($i <= 1 || $this->IsWord($head)) {
                    $WordArray[] = $head;
                } else {
                    // Not a dictionary word: emit the remaining characters one by one.
                    $WordArray[] = substr($head, 2);
                    $WordArray[] = substr($head, 0, 2);
                }
                break;
            }

            // Try the longest candidate ending at $i first, shrinking by one
            // character (two bytes) per attempt.
            $maxPos  = ($i >= $this->MaxLen) ? $this->MaxLen : $i;
            $isMatch = false;
            for ($j = $maxPos; $j >= 0; $j = $j - 2) {
                $w = substr($str, $i - $j, $j + 1);
                if ($this->IsWord($w)) {
                    $WordArray[] = $w;
                    $i = $i - $j - 1;
                    $isMatch = true;
                    break;
                }
            }

            if (!$isMatch) {
                // Nothing matched: consume one character as its own word so
                // the scan always advances (otherwise this loop never ends).
                $WordArray[] = substr($str, $i - 1, 2);
                $i = $i - 2;
            }
        }

        return $this->otherword($WordArray);
    }

    /**
     * Formats the words collected by RunRMM().
     *
     * The array is in reverse text order, so it is emitted back to front.
     * Every word is prefixed with $SplitChar and followed by a comma; the
     * leading $SplitChar of the whole string is then replaced by a comma.
     *
     * @param array $WordArray
     * @return string
     */
    function otherword($WordArray)
    {
        $spc   = $this->SplitChar;
        $rsStr = "";
        for ($i = count($WordArray) - 1; $i >= 0; $i--) {
            $rsStr .= $spc . $WordArray[$i] . ",";
        }
        // preg_quote keeps a separator such as "|" or "." from being read as regex syntax.
        return preg_replace("/^" . preg_quote($spc, "/") . "/", ",", $rsStr);
    }

    /**
     * Tells whether a word exists in the dictionary.
     *
     * $MaxLen is a byte INDEX, so the longest valid word has $MaxLen + 1
     * bytes - the same length RunRMM() uses for its longest candidate.
     *
     * @param string $okWord
     * @return bool
     */
    function IsWord($okWord)
    {
        $slen = strlen($okWord);
        if ($slen > $this->MaxLen + 1) {
            return false;
        }
        return isset($this->RankDic[$slen][$okWord]);
    }

    /**
     * Normalises a string before segmentation: collapses whitespace and puts
     * $SplitChar between runs of different character classes.
     *
     * @param string $str
     * @return string
     */
    function UpdateStr($str)
    {
        $spc  = $this->SplitChar;
        $slen = strlen($str);
        if ($slen == 0) {
            return '';
        }

        $okstr   = '';
        $prechar = 0; // class of the previous character: 0-blank 1-English 2-Chinese 3-symbol

        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];

            if (ord($ch) < 0x81) {
                // Single-byte whitespace / control characters.
                if (ord($ch) < 33) {
                    if ($prechar != 0 && $ch != "\r" && $ch != "\n") {
                        $okstr .= $spc;
                    }
                    $prechar = 0;
                    continue;
                }

                if (preg_match('/[^0-9a-zA-Z@.%#:&_-]/', $ch)) {
                    // Single-byte symbol: always separated from what precedes it.
                    $okstr  .= ($prechar == 0) ? $ch : $spc . $ch;
                    $prechar = 3;
                } else {
                    // Letters, digits and @ . % # : & _ - form "English" runs.
                    $okstr  .= ($prechar == 2 || $prechar == 3) ? $spc . $ch : $ch;
                    $prechar = 1;
                }
                continue;
            }

            // High byte: first byte of a two-byte character.
            // If the previous character is neither Chinese nor blank, add a separator.
            if ($prechar != 0 && $prechar != 2) {
                $okstr .= $spc;
            }
            // A trailing high byte without a partner is dropped.
            if (isset($str[$i + 1])) {
                $c = $ch . $str[$i + 1];
                $n = hexdec(bin2hex($c));
                if ($n > 0xA13F && $n < 0xAA40) {
                    // Two-byte symbol / punctuation: kept as its own token.
                    $okstr  .= ($prechar != 0) ? $spc . $c : $c;
                    $prechar = 3;
                } else {
                    $okstr  .= $c;
                    $prechar = 2;
                }
                $i++; // skip the second byte of the pair
            }
        }

        return $okstr;
    }
}
// Usage: create the segmenter and print the segmentation of a sample string.
$split = new SplitWord();
echo $split->SplitRMM("PHP Search Technology");
- Note: each line of the ppldic.csv dictionary has the format word + space + number + \n (newline).
Copy Code |