This program is a simple Chinese word segmenter based on the RMM (reverse maximum matching) approach. It still has many shortcomings, so corrections and suggestions from more experienced developers are welcome. This version also fixes the garbled-output (character encoding) problem.
<?php
/**
 * Chinese word segmentation based on RMM (reverse maximum matching).
 *
 * Input and output of SplitRMM() are UTF-8. Internally every string is
 * converted to GBK, where each Chinese character occupies exactly two
 * bytes, so all matching works on 2-byte steps.
 *
 * @author Tangpan
 * @date 2013-10-12
 * @version 1.0.0
 */
class SplitWord {
    /** @var array Dictionary words as keys (GBK word => true). Filled by the constructor; not read by this class. */
    public $Tag_dic = array();
    /** @var array Dictionary regrouped by byte length: [byte length][GBK word] => true. */
    public $Rec_dic = array();
    /** @var string Separator placed between tokens. */
    public $Split_char = ' ';
    /** @var string Pre-processed source string (GBK). */
    public $Source_str = '';
    /** @var string Segmentation result of the last SplitRMM() call (GBK). */
    public $Result_str = '';
    /** @var int Minimum byte length a Chinese token must have to be segmented. */
    public $limit_lenght = 2;
    /** @var int Maximum byte length of a dictionary word. */
    public $Dic_maxlen = 28;
    /** @var int Minimum byte length of a dictionary word. Kept for callers; the matching loop does not consult it. */
    public $Dic_minlen = 2;

    /**
     * PHP 4 style constructor, kept so existing callers keep working.
     * It simply (re)runs __construct(), which reloads the dictionary.
     */
    public function SplitWord() {
        $this->__construct();
    }

    /**
     * Loads the dictionary file words.csv that lives next to this file.
     *
     * Only the first space-separated field of every line is used as the word.
     * NOTE(review): the field delimiter was reconstructed from a garbled paste
     * as a single space - confirm against the real words.csv.
     */
    public function __construct() {
        $dic_path = dirname(__FILE__) . '/words.csv';
        $fp = fopen($dic_path, 'r');
        if ($fp === false) {
            // fopen() has already raised a warning; continue with an empty
            // dictionary instead of passing an invalid handle to fgets().
            return;
        }
        // Compare against false so a line consisting of "0" does not end the loop.
        while (($line = fgets($fp, 256)) !== false) {
            $ws = explode(' ', $line);
            $word = iconv('utf-8', 'GBK', $ws[0]);
            if ($word === false) {
                continue; // entry cannot be represented in GBK
            }
            $word = trim($word);
            if ($word === '') {
                continue; // blank line
            }
            $this->Tag_dic[$word] = true;
            $this->Rec_dic[strlen($word)][$word] = true;
        }
        fclose($fp);
    }

    /**
     * Sets the source string.
     *
     * Converts the UTF-8 input to GBK, pre-processes it with DealStr() and
     * stores the result in $Source_str.
     *
     * @param string $str UTF-8 string to be segmented
     */
    public function SetSourceStr($str) {
        // //IGNORE drops characters GBK cannot represent instead of failing
        // the whole conversion.
        $gbk = iconv('utf-8', 'GBK//IGNORE', (string) $str);
        $this->Source_str = $this->DealStr($gbk === false ? '' : $gbk);
    }

    /**
     * Checks whether a (GBK) string starts with a multi-byte character.
     *
     * @param string $str string to check
     * @return bool true when the first byte is above 0x80, false otherwise
     *              (including for an empty or blank string)
     */
    public function checkStr($str) {
        if (trim($str) == '') {
            return false;
        }
        return ord($str[0]) > 0x80;
    }

    /**
     * RMM segmentation entry point.
     *
     * Chinese runs are segmented with pregRmmSplit(); every other token is
     * copied through unchanged. Each piece is followed by the separator.
     *
     * @param string $str UTF-8 string to segment
     * @return string|null UTF-8 segmentation result, or null when the input
     *                     (or its pre-processed form) is empty
     */
    public function SplitRMM($str = '') {
        // Start from a clean slate so repeated calls do not accumulate output.
        $this->Result_str = '';
        if (trim($str) == '') {
            return;
        }
        $this->SetSourceStr($str);
        if ($this->Source_str == '') {
            return;
        }
        $spc = $this->Split_char;
        // DealStr() separated the tokens with $Split_char, so split on the same
        // value; explode() rejects an empty delimiter.
        $tokens = ($spc === '') ? array($this->Source_str) : explode($spc, $this->Source_str);
        foreach ($tokens as $token) {
            if (trim($token) == '') {
                continue; // produced by consecutive separators
            }
            if ($this->checkStr($token)) {
                if (strlen($token) >= $this->limit_lenght) {
                    $this->Result_str .= $this->pregRmmSplit($token) . $spc;
                }
            } else {
                $this->Result_str .= $token . $spc;
            }
        }
        return iconv('GBK', 'utf-8', $this->Result_str);
    }

    /**
     * Splits one run of Chinese (GBK) text by reverse maximum matching.
     *
     * Working from the end of the run, the longest dictionary word ending at
     * the current position is taken. When no word of two or more characters
     * matches, the single character is emitted on its own, so no input is
     * ever dropped. Candidates always cover whole 2-byte characters.
     *
     * @param string $str GBK string
     * @return string words in their original order, formatted by resetWord()
     */
    public function pregRmmSplit($str) {
        if ($str == '') {
            return '';
        }
        $charBytes = 2;        // one GBK Chinese character
        $found = array();      // words collected back to front
        $pos = strlen($str);   // exclusive end of the part not yet consumed
        while ($pos > 0) {
            $maxLen = min($pos, $this->Dic_maxlen);
            $maxLen -= $maxLen % $charBytes; // keep candidates character-aligned
            $taken = 0;
            for ($len = $maxLen; $len > $charBytes; $len -= $charBytes) {
                $w = substr($str, $pos - $len, $len);
                if ($this->IsWord($w)) {
                    $found[] = $w;
                    $taken = $len;
                    break;
                }
            }
            if ($taken == 0) {
                // No multi-character word ends here: emit the last character
                // (or a dangling single byte) by itself.
                $taken = min($charBytes, $pos);
                $found[] = substr($str, $pos - $taken, $taken);
            }
            $pos -= $taken;
        }
        return $this->resetWord($found);
    }

    /**
     * Re-assembles the words collected by pregRmmSplit().
     *
     * The words arrive back to front and are emitted front to back, each one
     * as separator + word + ' '. The leading separator of the assembled string
     * is then replaced by a comma.
     * NOTE(review): the replacement literal was garbled in the pasted source
     * (it may have been a full-width comma, "\xA3\xAC" in GBK) - confirm.
     *
     * @param array $Split_result GBK words in reverse order
     * @return string GBK string, '' when there is nothing to assemble
     */
    public function resetWord($Split_result) {
        if (!is_array($Split_result) || !$Split_result) {
            return '';
        }
        $spc = $this->Split_char;
        $ret_str = '';
        foreach (array_reverse($Split_result) as $word) {
            if (trim($word) != '') {
                $ret_str .= $spc . $word . ' ';
            }
        }
        // Plain prefix test instead of a regex, so a separator containing
        // regex metacharacters cannot break the pattern.
        $spcLen = strlen($spc);
        if ($spcLen > 0 && strncmp($ret_str, $spc, $spcLen) === 0) {
            $ret_str = ',' . substr($ret_str, $spcLen);
        }
        return $ret_str;
    }

    /**
     * Checks whether a word exists in the dictionary.
     *
     * @param string $okWord GBK word to look up
     * @return bool
     */
    public function IsWord($okWord) {
        $len = strlen($okWord);
        // Anything longer than the longest dictionary word cannot match.
        return $len <= $this->Dic_maxlen + 1 && isset($this->Rec_dic[$len][$okWord]);
    }

    /**
     * Pre-processes a GBK string into separator-delimited tokens.
     *
     * - whitespace, control characters (line breaks included) and ASCII
     *   punctuation outside the set @ . % # : ^ & _ - become a separator;
     * - ASCII letters, digits and the symbols above stay together, except that
     *   a separator is inserted where letters and digits meet ("abc123" ->
     *   "abc 123");
     * - Chinese text is separated from neighbouring ASCII text;
     * - GBK symbol-area characters (0xA140-0xAA3F, i.e. Chinese punctuation)
     *   are replaced by a separator.
     *
     * @param string $str GBK source string
     * @return string pre-processed string, '' for empty input
     */
    public function DealStr($str) {
        $str = (string) $str;
        $slen = strlen($str);
        if ($slen == 0) {
            return '';
        }
        $spc = $this->Split_char;
        $okstr = '';
        $prechar = 0;   // previous character: 0 blank, 1 ASCII letter/digit, 2 Chinese, 3 symbol
        $prevKind = ''; // 'alpha' or 'digit' while inside a letter/digit run, '' otherwise
        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];
            $code = ord($ch);
            if ($code < 0x81) { // single-byte character
                $kind = '';
                if ($code >= 0x30 && $code <= 0x39) {
                    $kind = 'digit';
                } elseif (($code >= 0x41 && $code <= 0x5A) || ($code >= 0x61 && $code <= 0x7A)) {
                    $kind = 'alpha';
                }
                if ($kind !== '') {
                    // Separate from preceding Chinese text and at a
                    // letter<->digit boundary. The previous kind is tracked in
                    // a variable because $str[$i - 1] would read the END of
                    // the string when $i is 0 (negative offsets, PHP 7.1+).
                    if ($prechar == 2 || ($prevKind !== '' && $prevKind !== $kind)) {
                        $okstr .= $spc;
                    }
                    $okstr .= $ch;
                    $prechar = 1;
                } elseif (strpos('@.%#:^&_-', $ch) !== false) {
                    // Symbols that commonly occur inside addresses, numbers
                    // and identifiers stay attached to the ASCII run.
                    if ($prechar == 2) {
                        $okstr .= $spc;
                    }
                    $okstr .= $ch;
                    $prechar = 3;
                } else {
                    // Blank, control character or other punctuation: a token
                    // boundary, so neighbouring words are never glued together.
                    $okstr .= $spc;
                    $prechar = 0;
                }
                $prevKind = $kind;
                continue;
            }
            // Lead byte of a two-byte GBK character.
            $prevKind = '';
            if ($prechar != 0 && $prechar != 2) {
                $okstr .= $spc; // previous character was ASCII text or a symbol
            }
            if (!isset($str[$i + 1])) {
                break; // dangling lead byte at the very end: nothing to emit
            }
            $n = ($code << 8) | ord($str[$i + 1]);
            if ($n > 0xA13F && $n < 0xAA40) { // Chinese punctuation / symbol area
                $okstr .= $spc;
                $prechar = 3;
            } else {
                $okstr .= $ch . $str[$i + 1];
                $prechar = 2;
            }
            $i++; // the trail byte has been consumed as well
        }
        return $okstr;
    }

    /**
     * Releases temporary data.
     *
     * The argument is passed by value, so this only drops the local copy; it
     * is kept for backward compatibility.
     *
     * @param mixed $data temporary data
     */
    public function clear($data) {
        unset($data);
    }
}
?>
Copy Code |