The code is as follows:
/**
 * Unigram (single-character) word segmentation for UTF-8 text.
 *
 * Every multibyte character (e.g. a Chinese character) becomes a token of its
 * own, while a run of consecutive single-byte, non-blank characters (ASCII
 * letters and digits) is kept together as one token. HTML tags are stripped
 * and punctuation is treated as a token boundary.
 *
 * UTF-8 lead bytes decide the character width: a lead byte <= 0xC0 is handled
 * as a 1-byte character, < 0xE0 starts a 2-byte sequence, < 0xF0 a 3-byte
 * sequence, and anything else a 4-byte sequence.
 *
 * Usage note: to index the single-character tokens with MySQL full-text
 * search, set ft_min_word_len=1 in my.ini; the current settings can be
 * inspected with: SHOW VARIABLES LIKE '%ft%'.
 *
 * @access global
 * @param string  $str    Text to segment; HTML tags are removed first.
 * @param boolean $unique Whether duplicate tokens are removed from the result.
 * @param boolean $merge  Whether the numeral equivalents (ASCII digit <->
 *                        Chinese numeral) found in the text are appended
 *                        after the regular tokens.
 * @return array Contiguous, zero-based list of token strings; empty when the
 *               input contains no text.
 */
function seg_word($str, $unique = false, $merge = true)
{
    $str = trim(strip_tags((string) $str));
    if ($str === '') {
        return array();
    }

    // Characters that act as token separators. ASCII whitespace and control
    // characters are not listed because the scanner below already treats
    // every byte < 33 as a separator.
    // NOTE(review): list reconstructed from a garbled listing - extend or
    // trim it as needed.
    $search = array(
        // ASCII punctuation
        ',', '/', '\\', '.', ';', ':', '\'', '!', '~', '"', '`', '^', '(', ')',
        '?', '-', '<', '>', '$', '&', '%', '#', '@', '+', '=', '{', '}', '[', ']',
        // Full-width / CJK punctuation
        ':', '(', ')', '.', '。', ',', '!', ';', '“', '”', '‘', '’',
        '[', ']', '、', '—', "\xE3\x80\x80" /* U+3000 ideographic space */,
        '《', '》', '-', '…', '【', '】',
    );

    // ASCII digit => Chinese numeral. Used in both directions to build the
    // list of extra search terms.
    // NOTE(review): numeral characters reconstructed from a machine-translated
    // listing - confirm against the intended table.
    $numpairs = array(
        '1' => '一', '2' => '二', '3' => '三', '4' => '四', '5' => '五',
        '6' => '六', '7' => '七', '8' => '八', '9' => '九', '0' => '零',
    );

    // alab_num() is a helper defined outside this snippet; it is applied only
    // when the host project provides it so the function also works standalone.
    if (function_exists('alab_num')) {
        $str = alab_num($str);
    }

    // Replace punctuation with a blank so it separates tokens instead of
    // gluing its neighbours together.
    $str = str_replace($search, ' ', $str);

    $result = array(); // regular tokens, in order of appearance
    $annex  = array(); // numeral equivalents collected along the way
    $word   = '';      // single-byte token currently being accumulated
    $len    = strlen($str);
    $i      = 0;

    // Bound the scan by the byte length: indexing past the end of the string
    // raises a warning, and relying on ord() === 0 would stop at a NUL byte.
    while ($i < $len) {
        $ord = ord($str[$i]);

        if ($ord <= 0xC0) {
            // 1-byte character.
            if ($ord < 33) {
                // Blank or control character: close the pending token.
                if ($word !== '') {
                    $result[] = $word;
                    $word = '';
                }
                $i++;
                continue;
            }

            $char = $str[$i];
            if (isset($numpairs[$char])) {
                // ASCII digit: remember its Chinese numeral.
                $annex[] = $numpairs[$char];
            }
            $word .= $char;
            $i++;
            continue;
        }

        // Multibyte character: it always forms a token of its own, so any
        // pending single-byte token ends here.
        if ($word !== '') {
            $result[] = $word;
            $word = '';
        }

        if ($ord < 0xE0) {
            $step = 2;
        } elseif ($ord < 0xF0) {
            $step = 3;
        } else {
            $step = 4;
        }

        $c = substr($str, $i, $step);
        $key = array_search($c, $numpairs, true);
        if ($key !== false) {
            // PHP stores the keys '0'..'9' as integers; cast back to string
            // so the result contains strings only.
            $annex[] = (string) $key;
        }
        $result[] = $c;
        $i += $step;
    }

    if ($word !== '') {
        $result[] = $word;
    }

    if ($merge) {
        $result = array_merge($result, $annex);
    }

    // array_unique() preserves keys; re-index so callers always get a list.
    return $unique ? array_values(array_unique($result)) : $result;
}
http://www.bkjia.com/PHPjc/320992.html www.bkjia.com true http://www.bkjia.com/PHPjc/320992.html techarticle Copy the code code as follows:/** * unary word algorithm * UTF8 encoding next character if the first character ASCII code is not greater than 192 only 1 bytes * If the first character ASCII code greater than 192 is less than 224 is accounted for ...