A simple Chinese word segmentation system in PHP. Structure: a hash keyed on the first character of each word, with trie index tree nodes below it. Advantage: during word segmentation you do not need to predict the length of the word being looked up; the text is matched character by character along the tree chain. Disadvantages: construction and maintenance are comparatively complicated, and the many word branches waste some space.
/**
 * Simple Chinese word segmentation system in PHP
 * Structure: hash of the first character and trie index tree nodes
 * Advantage: during word segmentation you do not need to predict the length of the word being looked up; the text is matched character by character along the tree chain.
 * Disadvantages: construction and maintenance are complicated, and the many word branches waste some space.
 * @version 0.1
 * @todo construct a general dictionary algorithm and write a simple word segmenter
 * @author shjuto@gmail.com
 * Trie dictionary tree
 *
 */
/**
 * Byte-wise trie (prefix tree) holding a dictionary of words.
 *
 * Every node is an array of the form
 *   array('children' => array(byte => node, ...), 'isword' => bool)
 * where 'isword' marks that the path from the root to this node spells a
 * complete dictionary word. Strings are walked with strlen() and string
 * offsets, so keys are single bytes and reported positions are byte offsets.
 */
class trie
{
    /** @var array Root node of the trie. */
    private $trie;

    /**
     * Create an empty dictionary consisting of a root node without children.
     */
    public function __construct()
    {
        $this->trie = array('children' => array(), 'isword' => false);
    }

    /**
     * Add a word to the dictionary.
     *
     * An empty string is ignored. Adding a word that is a prefix of an
     * existing word only flags the existing node; the longer word is kept.
     *
     * @param string $word word to insert
     * @return void
     */
    public function setword($word = '')
    {
        $word   = (string) $word;
        $length = strlen($word);
        if ($length === 0) {
            return;
        }

        // Walk by reference so that new nodes are created inside $this->trie.
        $node = &$this->trie;
        for ($i = 0; $i < $length; $i++) {
            $character = $word[$i];
            if (!isset($node['children'][$character])) {
                $node['children'][$character] = array('children' => array(), 'isword' => false);
            }
            $node = &$node['children'][$character];
        }

        // Flag the final node in place; replacing the node would drop the
        // children that belong to longer words sharing this prefix.
        $node['isword'] = true;
    }

    /**
     * Determine whether a string is a dictionary word.
     *
     * @param string $word candidate word
     * @return bool true when $word was added with setword(), false otherwise
     *              (including for the empty string and for mere prefixes)
     */
    public function isword($word)
    {
        $word   = (string) $word;
        $length = strlen($word);
        if ($length === 0) {
            return false;
        }

        $node = $this->trie;
        for ($i = 0; $i < $length; $i++) {
            $character = $word[$i];
            if (!isset($node['children'][$character])) {
                return false;
            }
            $node = $node['children'][$character];
        }

        return !empty($node['isword']);
    }

    /**
     * Find dictionary words in $text with a single forward scan.
     *
     * Starting at a byte that begins some dictionary word, the scan follows
     * the trie as far as the text allows and records every complete word met
     * on the way (so both "ab" and "abc" are reported for the text "abc").
     * When the chain breaks, the breaking byte is examined again from the
     * root, so that "ab" is still found in "aab".
     *
     * NOTE(review): bytes consumed by a partial match are not rescanned, so a
     * word that starts inside a longer match attempt can be missed.
     *
     * @param string $text text to scan
     * @return array list of array('position' => int byte offset, 'word' => string)
     */
    public function search($text = "")
    {
        $text    = (string) $text;
        $textlen = strlen($text);
        $root    = $this->trie;
        $node    = $root;
        $find    = array();
        $wordrootposition = 0;     // offset at which the current match attempt started
        $inword  = false;          // true while a match attempt is in progress
        $word    = '';

        for ($i = 0; $i < $textlen; $i++) {
            $character = $text[$i];
            if (isset($node['children'][$character])) {
                if ($inword === false) {
                    $wordrootposition = $i;
                }
                $inword = true;
                $word  .= $character;
                $node   = $node['children'][$character];
                if (!empty($node['isword'])) {
                    $find[] = array('position' => $wordrootposition, 'word' => $word);
                }
            } else {
                $node = $root;
                $word = '';
                if ($inword) {
                    // The chain broke on this byte: step back so the loop's
                    // increment re-examines the same byte from the root.
                    $i--;
                    $inword = false;
                }
            }
        }

        return $find;
    }
}
1 2
Bytes. Disadvantages: Comparison of construction and maintenance...