PHP unary (single-character) word segmentation algorithm. The code is reproduced below. Summary of the unary word segmentation algorithm for UTF-8 text: if the first byte of a character is not greater than 192 (0xC0), the character occupies only 1 byte; if the first byte is greater than 192 and less than 224 (0xE0), it occupies 2 bytes; otherwise it occupies 3 bytes.
The code is as follows:
/**
 * Unary (single-character) word segmentation for UTF-8 text.
 *
 * Runs of single-byte, non-blank characters (ASCII letters, digits, symbols)
 * are kept together as one token; every multi-byte character (e.g. a Chinese
 * character) becomes a token of its own. Punctuation listed in $search and
 * any byte below 33 (space, control characters) act as token separators.
 *
 * Character width is taken from the lead byte: up to 0xC0 is handled as one
 * byte, below 0xE0 as two bytes, below 0xF0 as three bytes, otherwise four.
 *
 * For one-character full-text matching in MySQL, ft_min_word_len = 1 must be
 * set in my.ini; `SHOW VARIABLES LIKE '%ft%'` lists the full-text settings.
 *
 * Depends on alab_num(), which is defined elsewhere in the project and is
 * applied to the text before splitting (presumably a digit normaliser --
 * TODO confirm against its definition).
 *
 * @access global
 * @param  string  $str    Text to segment; HTML/PHP tags are stripped first.
 * @param  boolean $unique Whether to remove duplicate tokens from the result.
 * @param  boolean $merge  Whether to append the digit/Chinese-numeral
 *                         counterparts of the numerals found in the text.
 * @return array   List of tokens. With $merge = false (or after
 *                 array_unique) the integer keys may have gaps.
 */
function seg_word($str, $unique = false, $merge = true)
{
    $str = trim(strip_tags((string) $str));
    if ($str === '') {
        return array();
    }

    // Characters that separate tokens. Add further characters as needed.
    // NOTE(review): the full-width entries were reconstructed from a garbled
    // listing -- verify against the project's original list.
    $search = array(
        // ASCII punctuation and line breaks
        ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')', '?',
        '-', "\t", "\n", "\r", '\'', '<', '>', '$', '&', '%', '#', '@', '+',
        '=', '{', '}', '[', ']',
        // Full-width / CJK punctuation
        ':', ')', '(', '.', '。', ',', '!', ';', '?', '“', '”', '‘', '’',
        '[', ']', '、', '—', '《', '》', '-', '…', '【', '】', ' ',
    );

    // ASCII digit => Chinese numeral. Used in both directions so that a
    // numeral written either way also contributes its counterpart.
    $numpairs = array(
        '1' => '一', '2' => '二', '3' => '三', '4' => '四', '5' => '五',
        '6' => '六', '7' => '七', '8' => '八', '9' => '九', '0' => '零',
    );

    $str = alab_num($str);
    // Replace separators with a blank (not an empty string) so the words on
    // either side of a punctuation mark are not glued together.
    $str = str_replace($search, ' ', $str);
    // Measure after the replacements: multi-byte separators shrink the string.
    $len = strlen($str);

    $i = 0;            // byte offset into $str
    $k = 0;            // index of the token currently being built
    $prechar = 0;      // previous character: 0 = blank, 1 = single-byte, 2 = multi-byte
    $result = array();
    $annex = array();  // numeral counterparts collected along the way

    // Iterate by length instead of testing ord() of an out-of-range offset,
    // which raised a notice/warning and stopped early on an embedded NUL byte.
    while ($i < $len) {
        $ord = ord($str[$i]);

        if ($ord <= 0xC0) {
            // Single byte.
            if ($ord < 33) {
                // Blank or control character: close the current token.
                $prechar = 0;
                $i++;
                $k++;
                continue;
            }

            $ch = $str[$i];
            if (isset($numpairs[$ch])) {
                $annex[] = $numpairs[$ch];
            }

            if ($prechar == 2) {
                // A single-byte run directly after a multi-byte character
                // starts a new token.
                $result[++$k] = $ch;
            } elseif (isset($result[$k])) {
                $result[$k] .= $ch;
            } else {
                $result[$k] = $ch;
            }

            $prechar = 1;
            $i++;
        } else {
            // Multi-byte character: width is derived from the lead byte.
            if ($ord < 0xE0) {
                $step = 2;
            } elseif ($ord < 0xF0) {
                $step = 3;
            } else {
                $step = 4; // 4-byte sequences, e.g. emoji and rare ideographs
            }
            $c = substr($str, $i, $step);

            $key = array_search($c, $numpairs, true);
            if (false !== $key) {
                // PHP turns numeric string keys into integers; cast back so
                // every token in the result is a string.
                $annex[] = (string) $key;
            }

            if ($prechar != 0) {
                // Every multi-byte character is its own token.
                $result[++$k] = $c;
            } elseif (isset($result[$k])) {
                $result[$k] .= $c;
            } else {
                $result[$k] = $c;
            }

            $prechar = 2;
            $i += $step;
        }
    }

    if ($merge) {
        $result = array_merge($result, $annex);
    }

    return $unique ? array_unique($result) : $result;
}
Source: http://www.bkjia.com/PHPjc/320992.html (www.bkjia.com, TechArticle). Summary: the code is as follows: a unary word segmentation algorithm. In UTF-8, if the first byte of a character is not greater than 192, the character occupies only 1 byte; if the first byte is greater than 192 and less than 224, it occupies ...