Chinese
A Chinese word segmentation class written in PHP
<?php
/**
 * Dictionary-based word segmenter for text in a double-byte encoding.
 *
 * Any byte with a value of 129 or above is treated as the first byte of a
 * two-byte character; runs of lower-valued bytes are kept together as a single
 * "ASCII run". Two-byte characters are grouped into words by scanning each line
 * from its end and taking the longest dictionary entry (2 to 4 characters) that
 * ends at the current position; a character that ends no dictionary word is
 * emitted on its own. Every emitted word is followed by one space.
 *
 * NOTE(review): the byte threshold suggests GBK/GB2312-style input; the class
 * itself never converts encodings, so the dictionary file and the text passed
 * in must use the same encoding.
 */
class Segmentation
{
    /**
     * Behaviour switches:
     *  - 'lowercase': lower-case ASCII runs in the output.
     *  - 'segment_english': put spaces around punctuation inside ASCII runs.
     *
     * @var array
     */
    public $options = array(
        'lowercase' => true,
        'segment_english' => false,
    );

    /** @var string Name read from the dictionary header, 'Unknown' if absent. */
    public $dict_name = 'Unknown';

    /** @var array Dictionary words stored as array keys (value is always 1). */
    public $dict_words = array();

    /**
     * Enables or disables lower-casing of ASCII runs.
     *
     * @param mixed $value Interpreted as a boolean.
     * @return bool Always true.
     */
    public function setLowercase($value)
    {
        $this->options['lowercase'] = (bool) $value;

        return true;
    }

    /**
     * Enables or disables punctuation splitting inside ASCII runs.
     *
     * @param mixed $value Interpreted as a boolean.
     * @return bool Always true.
     */
    public function setSegmentEnglish($value)
    {
        $this->options['segment_english'] = (bool) $value;

        return true;
    }

    /**
     * Loads a dictionary file and merges its words into $dict_words.
     *
     * The first line is a header: a type tag, optionally followed by a tab and
     * a dictionary name. Every following non-empty line is one word.
     *
     * @param string $dict_file Path to the dictionary file.
     * @return bool False when the file is missing, cannot be opened, is empty,
     *              or its type tag is not DICT_WORD_W; true otherwise.
     */
    public function load($dict_file)
    {
        if (!file_exists($dict_file)) {
            return false;
        }

        $fp = fopen($dict_file, 'r');
        if ($fp === false) {
            return false;
        }

        $header = fgets($fp, 1024);
        if ($header === false) {
            fclose($fp);

            return false;
        }

        $fields = explode("\t", trim($header));
        $dict_type = $fields[0];
        // The name is recorded before the type check, so it is updated even
        // when the dictionary is subsequently rejected.
        $this->dict_name = isset($fields[1]) ? $fields[1] : 'Unknown';

        // NOTE(review): the exact letter case of the expected tag could not be
        // confirmed, so the comparison is case-insensitive.
        if (strcasecmp($dict_type, 'DICT_WORD_W') !== 0) {
            fclose($fp);

            return false;
        }

        while (($line = fgets($fp)) !== false) {
            $word = rtrim($line);
            // Skip blank lines so an empty-string key never enters the dictionary.
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);

        return true;
    }

    /**
     * Returns the name of the most recently read dictionary header.
     *
     * @return string
     */
    public function getDictName()
    {
        return $this->dict_name;
    }

    /**
     * Segments a string, handling each "\n"-separated line independently.
     *
     * @param string $str Text to segment.
     * @return string|false Segmented text, or false when no dictionary words
     *                      are loaded.
     */
    public function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }

        return $this->_segmentLines(explode("\n", (string) $str));
    }

    /**
     * Segments the contents of a file line by line.
     *
     * @param string $filename Path of the file to read.
     * @return string|false Segmented text, or false when no dictionary words
     *                      are loaded or the file cannot be read.
     */
    public function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return false;
        }
        if (!is_readable($filename)) {
            return false;
        }

        $lines = file($filename);
        if ($lines === false) {
            return false;
        }

        return $this->_segmentLines($lines);
    }

    /**
     * Segments each line and joins the results, one "\n" after every line.
     *
     * Trailing whitespace is stripped from each input line first, and runs of
     * spaces in the combined output are collapsed to a single space.
     *
     * @param array $lines Lines of text.
     * @return string
     */
    public function _segmentLines($lines)
    {
        $segmented = '';
        foreach ($lines as $line) {
            $segmented .= $this->_segmentLine(rtrim($line)) . "\n";
        }

        while (strpos($segmented, '  ') !== false) {
            $segmented = str_replace('  ', ' ', $segmented);
        }

        return $segmented;
    }

    /**
     * Segments a single line using backward longest-match against the dictionary.
     *
     * @param string $str One line of text without its line terminator.
     * @return string The words of the line, each followed by a space; ASCII runs
     *                are inserted as they are, with no separator added.
     */
    public function _segmentLine($str)
    {
        $tokens = $this->_tokenizeLine((string) $str);
        $result = '';
        $pos = count($tokens);

        while ($pos > 0) {
            $token = $tokens[$pos - 1];

            if (is_array($token)) {
                // ASCII run: prepended without an extra separator.
                $result = $this->_formatAsciiRun($token[0]) . $result;
                $pos--;
                continue;
            }

            $length = $this->_longestWordEndingAt($tokens, $pos);
            if ($length > 0) {
                $word = implode('', array_slice($tokens, $pos - $length, $length));
                $result = $word . ' ' . $result;
                $pos -= $length;
            } else {
                $result = $token . ' ' . $result;
                $pos--;
            }
        }

        return $result;
    }

    /**
     * Splits a line into two-byte characters and ASCII runs.
     *
     * @param string $str Line to split.
     * @return array List whose items are either a two-byte string (one
     *               character) or a one-element array holding an ASCII run.
     */
    private function _tokenizeLine($str)
    {
        $length = strlen($str);
        // Pad so that a dangling lead byte at the very end still has a partner.
        if ($length > 0 && ord($str[$length - 1]) >= 129) {
            $str .= ' ';
        }

        $tokens = array();
        for ($i = 0; $i < $length; $i++) {
            if (ord($str[$i]) >= 129) {
                $tokens[] = $str[$i] . $str[$i + 1];
                $i++;
                continue;
            }

            $end = $i;
            while ($end < $length && ord($str[$end]) < 129) {
                $end++;
            }
            // Wrapped in an array so it can be told apart from a character.
            $tokens[] = array(substr($str, $i, $end - $i));
            $i = $end - 1;
        }

        return $tokens;
    }

    /**
     * Applies the 'segment_english' and 'lowercase' options to an ASCII run.
     *
     * @param string $run Run of bytes below 129.
     * @return string
     */
    private function _formatAsciiRun($run)
    {
        if ($this->options['segment_english']) {
            // NOTE(review): character class reconstructed as ASCII punctuation
            // plus tab and form feed - confirm against a pristine copy.
            $punct = '[!"#$%&\'()*+,\-.\/:;<=>?@\[\\\\\]^_`{|}~\t\f]';
            // Surround each punctuation run with spaces, then split adjacent
            // punctuation pairs apart.
            $run = preg_replace('/(' . $punct . '+)/', ' $1 ', $run);
            $run = preg_replace('/(' . $punct . ')(' . $punct . ')/', ' $1 $2 ', $run);
        }
        if ($this->options['lowercase']) {
            $run = strtolower($run);
        }

        return $run;
    }

    /**
     * Finds the longest dictionary word that ends just before $pos.
     *
     * Candidates are built from at most 4 consecutive two-byte characters and
     * never extend across an ASCII run.
     *
     * @param array $tokens Output of _tokenizeLine().
     * @param int   $pos    Index one past the last character of the candidate.
     * @return int Length in characters (2 to 4) of the match, or 0 if none.
     */
    private function _longestWordEndingAt($tokens, $pos)
    {
        $candidates = array(0 => '');
        $limit = min($pos, 4);
        for ($i = 1; $i <= $limit; $i++) {
            $token = $tokens[$pos - $i];
            if (is_array($token)) {
                // An ASCII run can never be part of a dictionary word.
                break;
            }
            $candidates[$i] = $token . $candidates[$i - 1];
        }

        // Single characters are not looked up; they are emitted as they are.
        for ($i = count($candidates) - 1; $i > 1; $i--) {
            if (array_key_exists($candidates[$i], $this->dict_words)) {
                return $i;
            }
        }

        return 0;
    }
}
?>
Source references:
http://www.phpchina.cn/code/2006/0607/381.html
http://www.xuchao.cn/?play=reply&id=851