<?php
// RMM word segmentation algorithm: class SplitWord.
// Properties: $TagDic and $RankDic (arrays), $SourceStr, $ResultStr, $SplitChar (delimiter),
// $SplitLen = 4 (reserved word length), $MaxLen = 7 (the maximum text in the dictionary; the value is expressed in bytes).
// RMM Word Segmentation Algorithm
class SplitWord {
    // Dictionary word => second field of its dictionary line.
    public $TagDic = array();
    // Byte length => (dictionary word => third field of its line); IsWord() looks words up here.
    public $RankDic = array();
    // Normalised text waiting to be segmented (see SetSource() / UpdateStr()).
    public $SourceStr = '';
    // Segmentation result accumulated by SplitRMM().
    public $ResultStr = '';
    // Delimiter placed between tokens. A single space: an empty delimiter cannot be
    // used with explode(), and UpdateStr() is documented as "adding a space".
    public $SplitChar = ' ';
    // Reserved word length in bytes: SplitRMM() only segments Chinese tokens longer than this.
    public $SplitLen = 4;
    // Largest byte index of the longest dictionary word (the word itself is MaxLen + 1 bytes).
    public $MaxLen = 7;
    // Largest byte index of the smallest Chinese word (the word itself is MinLen + 1 bytes).
    public $MinLen = 3;
    // Optional stream handle released by Clear(); nothing in this class opens it.
    public $QuickDic = null;

    // PHP 4 style constructor kept for backward compatibility; delegates to __construct().
    public function SplitWord($dicfile = null)
    {
        $this->__construct($dicfile);
    }

    // Preloads the dictionary into memory to speed up segmentation.
    //
    // $dicfile: optional dictionary path; defaults to "ppldic.csv" next to this file.
    // Each line is expected to hold three space-separated fields (word, second field,
    // third field) -- NOTE(review): the separator is assumed to be a space; confirm
    // against the real dictionary file.
    // If the file cannot be opened a warning is raised and the dictionaries stay empty.
    public function __construct($dicfile = null)
    {
        if ($dicfile === null) {
            $dicfile = dirname(__FILE__) . "/ppldic.csv";
        }
        $fp = @fopen($dicfile, 'r');
        if ($fp === false) {
            trigger_error("SplitWord: cannot open dictionary file " . $dicfile, E_USER_WARNING);
            return;
        }
        // Compare against false explicitly so a line containing only "0" does not end the loop.
        while (($line = fgets($fp)) !== false) {
            $ws = explode(' ', rtrim($line, "\r\n"));
            if ($ws[0] === '') {
                continue; // blank line
            }
            if (isset($ws[1])) {
                $this->TagDic[$ws[0]] = $ws[1];
            }
            if (isset($ws[2])) {
                $this->RankDic[strlen($ws[0])][$ws[0]] = $ws[2];
            }
        }
        fclose($fp);
    }

    // Releases resources: closes $QuickDic when it holds an open stream.
    public function Clear()
    {
        if (is_resource($this->QuickDic)) {
            fclose($this->QuickDic);
        }
        $this->QuickDic = null;
    }

    // Sets the source string (normalised through UpdateStr()) and resets the result.
    public function SetSource($str)
    {
        $this->SourceStr = $this->UpdateStr($str);
        $this->ResultStr = "";
    }

    // Checks whether the string does not start with a Chinese (GBK multi-byte) character.
    // Returns "" for an empty string, false when the first byte is above 0x80, true otherwise.
    public function NotGBK($str)
    {
        if ($str == "") {
            return "";
        }
        return ord($str[0]) <= 0x80;
    }

    // RMM word segmentation entry point.
    //
    // $str: text to segment; when empty, the text previously given to SetSource() is used.
    // Returns the accumulated result string ("" when there is nothing to segment).
    // Tokens are processed from last to first and prepended, so the result keeps the
    // original order; every emitted token is followed by $SplitChar.
    public function SplitRMM($str = "")
    {
        if ($str != "") {
            $this->SetSource($str);
        }
        if ($this->SourceStr == "") {
            return "";
        }
        $this->SourceStr = $this->UpdateStr($this->SourceStr);
        $spc = $this->SplitChar;
        // UpdateStr() joins tokens with $SplitChar, so split on the same delimiter.
        $sep = ($spc !== '') ? $spc : ' ';
        $spwords = explode($sep, $this->SourceStr);
        for ($i = count($spwords) - 1; $i >= 0; $i--) {
            $token = $spwords[$i];
            if ($token === '') {
                continue;
            }
            if ($this->NotGBK($token)) {
                // Keep the token when it contains anything besides digits, '.', '+' and '-'.
                if (preg_match('/[^0-9.+\-]/', $token)) {
                    $this->ResultStr = $token . $spc . $this->ResultStr;
                }
                // NOTE(review): purely numeric tokens are dropped here, exactly as before
                // (the old branch only computed an unused local). Confirm this is intended.
            } elseif (strlen($token) > $this->SplitLen) {
                $this->ResultStr = $this->RunRMM($token) . $spc . $this->ResultStr;
            }
            // NOTE(review): Chinese tokens of at most $SplitLen bytes are dropped as well
            // (the old branch was empty). Confirm this is intended.
        }
        return $this->ResultStr;
    }

    // Reverse matching for an all-Chinese string.
    //
    // Walks the string from its last byte towards the first, each time taking the longest
    // dictionary word (at most MaxLen + 1 bytes) that ends at the current position. When
    // nothing matches, the last two-byte character is emitted on its own so the scan
    // always advances. Returns the words formatted by otherword().
    public function RunRMM($str)
    {
        $spLen = strlen($str);
        $WordArray = array(); // collected back to front
        for ($i = $spLen - 1; $i >= 0;) {
            // Only the smallest possible word (or less) is left at the head of the string.
            if ($i <= $this->MinLen) {
                $head = substr($str, 0, $i + 1);
                if (strlen($head) > 2 && $this->IsWord($head)) {
                    $WordArray[] = $head;
                } else {
                    // Not a dictionary word: emit its two-byte characters, last one first.
                    for ($end = strlen($head); $end > 0; $end -= 2) {
                        $start = max(0, $end - 2);
                        $WordArray[] = substr($head, $start, $end - $start);
                    }
                }
                break;
            }
            // More than the smallest word is left: try the longest candidate first and
            // shrink it by one two-byte character at a time.
            $maxPos = ($i >= $this->MaxLen) ? $this->MaxLen : $i;
            $isMatch = false;
            for ($j = $maxPos; $j >= 0; $j -= 2) {
                $w = substr($str, $i - $j, $j + 1);
                if ($this->IsWord($w)) {
                    $WordArray[] = $w;
                    $i -= $j + 1;
                    $isMatch = true;
                    break;
                }
            }
            if (!$isMatch) {
                // No dictionary word ends here; without this step $i would never change.
                $start = max(0, $i - 1);
                $WordArray[] = substr($str, $start, $i - $start + 1);
                $i = $start - 1;
            }
        }
        return $this->otherword($WordArray);
    }

    // Formats the words found by RunRMM(): the array is read in reverse (restoring the
    // original order), each word gets a trailing comma, and the pieces are joined with
    // $SplitChar without a leading delimiter.
    public function otherword($WordArray)
    {
        $parts = array();
        for ($i = count($WordArray) - 1; $i >= 0; $i--) {
            $parts[] = $WordArray[$i] . ",";
        }
        return implode($this->SplitChar, $parts);
    }

    // Determines whether a word exists in the dictionary.
    // $MaxLen is a byte index, so the longest acceptable word is $MaxLen + 1 bytes --
    // the same length as the longest candidate RunRMM() proposes.
    public function IsWord($okWord)
    {
        $slen = strlen($okWord);
        if ($slen > $this->MaxLen + 1) {
            return false;
        }
        return isset($this->RankDic[$slen][$okWord]);
    }

    // Normalises a string before segmentation (punctuation, mixed Chinese and English).
    //
    // Scans byte by byte and inserts $SplitChar wherever the character class changes.
    // Returns the normalised string ('' for empty input).
    public function UpdateStr($str)
    {
        $str = (string) $str;
        $spc = $this->SplitChar;
        $slen = strlen($str);
        if ($slen == 0) {
            return '';
        }
        $okstr = '';
        $prechar = 0; // class of the previous character: 0-blank 1-English 2-Chinese 3-symbol
        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];
            $code = ord($ch);
            if ($code < 0x81) {
                if ($code < 33) {
                    // Blank or control character.
                    // NOTE(review): "\r" and "\n" emit no delimiter, so the tokens on either
                    // side of a line break end up joined. Kept as is.
                    if ($prechar != 0 && $ch != "\r" && $ch != "\n") {
                        $okstr .= $spc;
                    }
                    $prechar = 0;
                    continue;
                }
                if (preg_match('/[^0-9a-zA-Z@.%#:\/&_-]/', $ch)) {
                    // Single-byte symbol: always a token of its own.
                    $okstr .= ($prechar == 0) ? $ch : $spc . $ch;
                    $prechar = 3;
                } else {
                    // Letter, digit or one of @ . % # : / & _ - : extends the current
                    // English token unless the previous character was Chinese or a symbol.
                    $okstr .= ($prechar == 2 || $prechar == 3) ? $spc . $ch : $ch;
                    $prechar = 1;
                }
                continue;
            }
            // Lead byte of a two-byte character.
            if (!isset($str[$i + 1])) {
                break; // truncated trailing byte: nothing to pair it with
            }
            $c = $ch . $str[$i + 1];
            $n = ($code << 8) | ord($str[$i + 1]);
            if ($n > 0xA13F && $n < 0xAA40) {
                // Two-byte symbol: separated from whatever precedes it.
                if ($prechar != 0) {
                    $okstr .= $spc;
                }
                $okstr .= $c;
                $prechar = 3;
            } else {
                // Chinese character: starts a new token after English text or a symbol.
                if ($prechar == 1 || $prechar == 3) {
                    $okstr .= $spc;
                }
                $okstr .= $c;
                $prechar = 2;
            }
            $i++; // skip the trail byte
        }
        return $okstr;
    }
}
?>