Author: livelylittlefish
Source: http://blog.csdn.net/livelylittlefish/archive/2011/05/10/6410569.aspx
This blog (http://blog.csdn.net/livelylittlefish) collects the author's (a wave) research and study notes; corrections from readers are welcome!
Content
1. Trie Basics
(1) What is it?
(2) Nature
(3) Applications
(4) Advantages
2. An example
(1) Functions
(2) Code
(3) running result
(4) Analysis
3. slightly modify
(1) Analysis
(2) modify
(3) Code
(4) running result
(5) Analysis
(6) A slightly complex result
4. Summary
1. Trie Basics
(1) What is it?
Trie, also known as word search tree or key tree, is a tree structure and a variant of the hash tree.
(2) Nature
The root node does not contain characters. Each node except the root node only contains one character.
Concatenating the characters along the path from the root node to a given node yields the string that corresponds to that node.
All subnodes of each node contain different characters.
For example, consider the trie corresponding to the word sequence a, to, tea, ted, ten, i, in, inn.
(3) Applications
It is used to count and sort a large number of strings, but not limited to strings. Therefore, it is often used by the search engine system for text word frequency statistics.
(4) Advantages
Minimize unnecessary string comparisons
Query efficiency is higher than hash table
2. An example
This example is taken from the Chinese Wikipedia.
(1) Functions
Enter strings from the console. Each string is terminated by pressing Enter, and the whole input is terminated by a line containing '#'. The program counts the number of occurrences of each string among all input strings.
(2) Code
An example code that implements this function is as follows.
View plaincopy to clipboardprint?
/**
 * Trie tree test
 * Description: make statistics on every word for its frequency
 * Usage: Input some strings, each followed by an 'enter' character, and end with '#'
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of child slots per node: one for every possible byte value. */
#define max_char_number 256
/* Longest word (in characters) that traverse() can rebuild in its buffer. */
#define max_word_len 128

/*
 * One trie node.
 *   count_ - how many times the word ending at this node was inserted
 *   next_  - child pointers, indexed by the character code of the next letter
 */
struct trie_node
{
    int count_;
    struct trie_node *next_[max_char_number];
};

/* The root carries no character; all of its child slots start out empty. */
static struct trie_node root = {0, {NULL}};

/* Delimiter set handed to strsep(): tab, newline, space, / . " ' ( ) */
static const char *spaces = "\t\n /.\"\'()";
Static int insert (const char * word)
{
Int loop;
Struct trie_node * cur, * newnode;
If (word [0] = '/0 ')
Return 0;
Cur = & root;
For (loop = 0; ++ loop)
{
If (cur-> next _ [word [loop] = NULL)
{
Newnode = (struct trie_node *) malloc (sizeof (struct trie_node ));
Memset (newnode, 0, sizeof (struct trie_node ));
Cur-> next _ [word [loop] = newnode;
}
If (word [loop] = '/0 ')
Break;
Cur = cur-> next _ [word [loop];
}
Cur-> count _ ++;
Return 0;
}
Void input ()
{
Char * linebuf = NULL, * line = NULL, * word = NULL;
Size_t bufsize = 0;
Int ret;
While (1)
{
Ret = Getline (& linebuf, & bufsize, stdin );
If (ret =-1)
Break;
Line = linebuf;
WORD = strsep (& line, spaces );
/* Input' # 'will terminate this input */
If (strcmp (word, "#") = 0)
Break;
If (word [0] = '/0 ')
Continue;
Insert (Word );
}
}
/**
 * Print one result line: the word, a tab, its occurrence count, a newline.
 *
 * @param str  NUL-terminated word.
 * @param n    Number of occurrences of the word.
 */
static void printword(const char *str, int n)
{
    printf("%s\t%d\n", str, n);
}
Static int traverse (struct trie_node * rootp)
{
Static char worddump [max_word_len + 1];
Static int Pos = 0;
Int I;
If (rootp = NULL)
Return 0;
If (rootp-> count _)
{
Worddump [POS] = '/0 ';
Printword (worddump, rootp-> count _);
}
For (I = 0; I <max_char_number; ++ I)
{
Worddump [POS ++] = I;
Traverse (rootp-> next _ [I]);/* recursive call */
Pos --;
}
Return 0;
}
Void dump (struct trie_node * node)
{
Static int depth = 0;
Static const char prefix [] = "";
Int loop = 0;
If (node = NULL)
Return;
For (loop = 0; loop <max_char_number; loop ++)
{
If (node-> next _ [loop])
{
Printf ("%. * s", (INT) Depth ++, prefix );
Printf ("next ['% C'] = 0x % x, Count = % d/N", loop, (unsigned INT) (node-> next _ [loop]), node-> next _ [loop]-> count _);
Dump (node-> next _ [loop]);/* recursive call */
Depth --;
}
}
}
/**
 * Read words from stdin, then print the per-word counts followed by a dump
 * of the trie's node structure.
 */
int main(void)
{
    input();
    printf("\n");
    traverse(&root);
    printf("\n");
    dump(&root);
    return 0;
}
/**
 * Trie tree test
 * Description: make statistics on every word for its frequency
 * Usage: Input some strings, each followed by an 'enter' character, and end with '#'
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of child slots per node: one for every possible byte value. */
#define max_char_number 256
/* Longest word (in characters) that traverse() can rebuild in its buffer. */
#define max_word_len 128

/*
 * One trie node.
 *   count_ - how many times the word ending at this node was inserted
 *   next_  - child pointers, indexed by the character code of the next letter
 */
struct trie_node
{
    int count_;
    struct trie_node *next_[max_char_number];
};

/* The root carries no character; all of its child slots start out empty. */
static struct trie_node root = {0, {NULL}};

/* Delimiter set handed to strsep(): tab, newline, space, / . " ' ( ) */
static const char *spaces = "\t\n /.\"\'()";
Static int insert (const char * word)
{
Int loop;
Struct trie_node * cur, * newnode;
If (word [0] = '/0 ')
Return 0;
Cur = & root;
For (loop = 0; ++ loop)
{
If (cur-> next _ [word [loop] = NULL)
{
Newnode = (struct trie_node *) malloc (sizeof (struct trie_node ));
Memset (newnode, 0, sizeof (struct trie_node ));
Cur-> next _ [word [loop] = newnode;
}
If (word [loop] = '/0 ')
Break;
Cur = cur-> next _ [word [loop];
}
Cur-> count _ ++;
Return 0;
}
Void input ()
{
Char * linebuf = NULL, * line = NULL, * word = NULL;
Size_t bufsize = 0;
Int ret;
While (1)
{
Ret = Getline (& linebuf, & bufsize, stdin );
If (ret =-1)
Break;
Line = linebuf;
WORD = strsep (& line, spaces );
/* Input' # 'will terminate this input */
If (strcmp (word, "#") = 0)
Break;
If (word [0] = '/0 ')
Continue;
Insert (Word );
}
}
/**
 * Print one result line: the word, a tab, its occurrence count, a newline.
 *
 * @param str  NUL-terminated word.
 * @param n    Number of occurrences of the word.
 */
static void printword(const char *str, int n)
{
    printf("%s\t%d\n", str, n);
}
Static int traverse (struct trie_node * rootp)
{
Static char worddump [max_word_len + 1];
Static int Pos = 0;
Int I;
If (rootp = NULL)
Return 0;
If (rootp-> count _)
{
Worddump [POS] = '/0 ';
Printword (worddump, rootp-> count _);
}
For (I = 0; I <max_char_number; ++ I)
{
Worddump [POS ++] = I;
Traverse (rootp-> next _ [I]);/* recursive call */
Pos --;
}
Return 0;
}
Void dump (struct trie_node * node)
{
Static int depth = 0;
Static const char prefix [] = "";
Int loop = 0;
If (node = NULL)
Return;
For (loop = 0; loop <max_char_number; loop ++)
{
If (node-> next _ [loop])
{
Printf ("%. * s", (INT) Depth ++, prefix );
Printf ("next ['% C'] = 0x % x, Count = % d/N", loop, (unsigned INT) (node-> next _ [loop]), node-> next _ [loop]-> count _);
Dump (node-> next _ [loop]);/* recursive call */
Depth --;
}
}
}
/**
 * Read words from stdin, then print the per-word counts followed by a dump
 * of the trie's node structure.
 */
int main(void)
{
    input();
    printf("\n");
    traverse(&root);
    printf("\n");
    dump(&root);
    return 0;
}
(3) running result
#./Trie
A
To
Tea
Ted
Ten
I
In
Inn
#
A 1
I 1
In 1
Inn 1
Tea 1
Ted 1
Ten 1
To 1
Next ['a'] = 0x88c1088, Count = 1
Next [''] = 0x88c1490, Count = 0
Next ['I'] = 0x88c40e8, Count = 1
Next [''] = 0x88c44f0, Count = 0
Next ['n'] = 0x88c48f8, Count = 1
Next [''] = 0x88c4d00, Count = 0
Next ['n'] = 0x88c5108, Count = 1
Next [''] = 0x88c5510, Count = 0
Next ['T'] = 0x88c1898, Count = 0
Next ['E'] = 0x88c24b0, Count = 0
Next ['a'] = 0x88c28b8, Count = 1
Next [''] = 0x88c2cc0, Count = 0
Next ['D'] = 0x88c30c8, Count = 1
Next [''] = 0x88c34d0, Count = 0
Next ['n'] = 0x88c38d8, Count = 1
Next [''] = 0x88c3ce0, Count = 0
Next ['O'] = 0x88c1ca0, Count = 1
Next [''] = 0x88c20a8, Count = 0
(4) Analysis
In this program, the author adds the dump function, which recursively prints the non-empty entries of the next array, as shown in the figure (not reproduced here).
With the occurrence counts added, the trie looks as shown in the figure. The number on each edge (solid or dashed) is the subscript into the next array, i.e. the ASCII code of the character of the node that the edge leads to. In the program, the character itself, that is, its ASCII code, is used directly as the subscript of the next array. A child is therefore reached by direct indexing, without any searching, which gives O(1) access.
As shown in the figure
next[97] has a count of 1, i.e. the string "a" occurs once;
next[105] also has a count of 1, i.e. the string "i" occurs once;
next[105]->next[110] has a count of 1, i.e. the string "in" occurs once;
next[105]->next[110]->next[110] has a count of 1, i.e. the string "inn" occurs once;
...
How do you feel? Is it intuitive?
-- Of course!
3. slightly modify
(1) Analysis
In the preceding example, the program already records a count of 1 for next[105]->next[110], i.e. the string "in" occurs once. So why does next[105]->next[110]->next[0] also have a value? That is, the next[0] pointer actually points to an allocated node, even though every field of that node is 0.
If the number of strings to be counted is large, then the number of leaf nodes, or of word-ending nodes such as 'i', is also large, and each of them carries such a useless next[0] node. The wasted space is significant: from the program we know that each node occupies sizeof(struct trie_node) = 1028 bytes (one 4-byte int plus 256 4-byte pointers on a 32-bit system). With 10000 such nodes, 1028*10000 bytes are wasted, about 10 MB.
(2) modify
Debugging the code above leads to the insert function: when a string is inserted into the trie, the exit condition of the for loop should be checked before a new node is created. The function can be modified as follows.
Static int insert (const char * word)
{
Int loop;
Struct trie_node * cur, * newnode;
If (word [0] = '/0 ')
Return 0;
Cur = & root;
For (loop = 0; ++ loop)
{
If (word [loop] = '/0')/* The Break condition shocould be here */
Break;
If (cur-> next _ [word [loop] = NULL)
{
Newnode = (struct trie_node *) malloc (sizeof (struct trie_node ));
Memset (newnode, 0, sizeof (struct trie_node ));
Cur-> next _ [word [loop] = newnode;
}
Cur = cur-> next _ [word [loop];
}
Cur-> count _ ++;
Return 0;
}
(3) Code
Omitted.
(4) running result
#./Trie
A
To
Tea
Ted
Ten
I
In
Inn
#
A 1
I 1
In 1
Inn 1
Tea 1
Ted 1
Ten 1
To 1
Next ['a'] = 0x8bb8088, Count = 1
Next ['I'] = 0x8bb9cc0, Count = 1
Next ['n'] = 0x8bba0c8, Count = 1
Next ['n'] = 0x8bba4d0, Count = 1
Next ['T'] = 0x8bb8490, Count = 0
Next ['E'] = 0x8bb8ca0, Count = 0
Next ['a'] = 0x8bb90a8, Count = 1
Next ['D'] = 0x8bb94b0, Count = 1
Next ['n'] = 0x8bb98b8, Count = 1
Next ['O'] = 0x8bb8898, Count = 1
(5) Analysis
Comparing the running results before and after the modification, we find that the modified program no longer creates the next[0] nodes, as shown in the figure (not reproduced here).
It can also be seen that, starting from the root node:
next[97] has a count of 1, i.e. the string "a" occurs once;
next[105] also has a count of 1, i.e. the string "i" occurs once;
next[105]->next[110] has a count of 1, i.e. the string "in" occurs once;
next[105]->next[110]->next[110] has a count of 1, i.e. the string "inn" occurs once;
...
Compared with the previous graph, this graph does not have junk data of next [0], which is more intuitive.
(6) A slightly complex result
#./Trie
A
To
Tea
Ted
Ten
I
In
Inn
A
To
Tea
Ted
Ten
I
In
Inn
A
I
A
A
#
A 5
I 3
In 2
Inn 2
Tea 2
Ted 2
Ten 2
To 2
Next ['a'] = 0x9604088, Count = 5
Next ['I'] = 0x9605cc0, Count = 3
Next ['n'] = 0x96060c8, Count = 2
Next ['n'] = 0x96064d0, Count = 2
Next ['T'] = 0x9604490, Count = 0
Next ['E'] = 0x9604ca0, Count = 0
Next ['a'] = 0x96050a8, Count = 2
Next ['D'] = 0x96054b0, Count = 2
Next ['n'] = 0x96058b8, Count = 2
Next ['O'] = 0x9604898, Count = 2
4. Summary
This article briefly introduces the basic concepts of the trie and uses an example to describe its application.
When designing a program, a few simple techniques can improve its efficiency. For example, this article uses the character itself, that is, its ASCII code value, as the subscript of the next array to obtain O(1) access. Loop boundary conditions also deserve care, so that no junk nodes are created.
Reference
http://zh.wikipedia.org/wiki/Trie
http://en.wikipedia.org/wiki/Trie
http://en.wikipedia.org/wiki/Hash_trie
http://en.wikipedia.org/wiki/Hash_array_mapped_trie
http://www.topcoder.com/tc?module=Static&d1=tutorials&d2=usingTries
This article from the csdn blog, reproduced please indicate the source: http://blog.csdn.net/livelylittlefish/archive/2011/05/10/6410569.aspx