Introduction to the trie tree

Source: Internet
Author: User

 

Author: livelylittlefish

Source: http://blog.csdn.net/livelylittlefish/archive/2011/05/10/6410569.aspx

 

This blog (http://blog.csdn.net/livelylittlefish) collects research and study notes by the author ("a wave"). Corrections from readers are welcome!

 

Content

1. Trie Basics

(1) What is it?

(2) Nature

(3) Applications

(4) Advantages

2. An example

(1) Functions

(2) Code

(3) running result

(4) Analysis

3. A slight modification

(1) Analysis

(2) Modification

(3) Code

(4) running result

(5) Analysis

(6) A slightly complex result

4. Summary

 

 

1. Trie Basics

 

(1) What is it?

Trie, also known as word search tree or key tree, is a tree structure and a variant of the hash tree.

 

(2) Nature

The root node does not contain a character. Every node except the root contains exactly one character.
Concatenating the characters along the path from the root to a node yields the string that corresponds to that node.
The children of any given node all contain different characters.
 

For example, consider the trie that corresponds to the word sequence a, to, tea, ted, ten, i, in, inn (the figure is not reproduced here).

 

(3) Applications

It is used to count and sort a large number of strings, but not limited to strings. Therefore, it is often used by the search engine system for text word frequency statistics.

 

(4) Advantages

Minimize unnecessary string comparisons
Query efficiency is higher than hash table
 

2. An example

 

This example is taken from the Chinese Wikipedia.

 

(1) Functions

 

Read strings from the console. Each string is terminated by Enter, and the whole input is terminated by a line containing '#'. The program counts how many times each string occurs among all the input strings.

 

(2) Code

 

An example code that implements this function is as follows.

View plaincopy to clipboardprint?
/**
 * Trie tree test
 * Description: count how often every word occurs in the input
 * Usage: type some strings, each followed by Enter, and finish with '#'
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* getline() and strsep() are not ISO C; glibc hides them in strict mode */
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define max_char_number 256 /* number of child slots per node, one per byte value */
#define max_word_len 128    /* sizes the word buffer used by traverse() */

/* One trie node: an occurrence counter plus one child pointer per character code. */
struct trie_node
{
    int count_;                               /* bumped by insert() on the node where a word ends */
    struct trie_node *next_[max_char_number]; /* children, indexed by character code; NULL = absent */
};

/* The root holds no character of its own; it starts with no children. */
static struct trie_node root = {0, {NULL}};

/* Delimiter set passed to strsep() by input(): tab, newline, space, / . " ' ( ) */
static const char *spaces = "\t\n /.\"\'()";
Static int insert (const char * word)
{
Int loop;
Struct trie_node * cur, * newnode;
If (word [0] = '/0 ')
Return 0;
Cur = & root;
For (loop = 0; ++ loop)
{
If (cur-> next _ [word [loop] = NULL)
{
Newnode = (struct trie_node *) malloc (sizeof (struct trie_node ));
Memset (newnode, 0, sizeof (struct trie_node ));
Cur-> next _ [word [loop] = newnode;
}
If (word [loop] = '/0 ')
Break;
Cur = cur-> next _ [word [loop];
}
Cur-> count _ ++;
Return 0;
}
Void input ()
{
Char * linebuf = NULL, * line = NULL, * word = NULL;
Size_t bufsize = 0;
Int ret;
While (1)
{
Ret = Getline (& linebuf, & bufsize, stdin );
If (ret =-1)
Break;
Line = linebuf;
WORD = strsep (& line, spaces );
/* Input' # 'will terminate this input */
If (strcmp (word, "#") = 0)
Break;
If (word [0] = '/0 ')
Continue;
Insert (Word );
}
}
/**
 * Print one result row to stdout: the word, a tab, its count, a newline.
 *
 * @param str  NUL-terminated word to print.
 * @param n    number of occurrences of the word.
 */
static void printword(const char *str, int n)
{
    printf("%s\t%d\n", str, n);
}
Static int traverse (struct trie_node * rootp)
{
Static char worddump [max_word_len + 1];
Static int Pos = 0;
Int I;
If (rootp = NULL)
Return 0;
If (rootp-> count _)
{
Worddump [POS] = '/0 ';
Printword (worddump, rootp-> count _);
}
For (I = 0; I <max_char_number; ++ I)
{
Worddump [POS ++] = I;
Traverse (rootp-> next _ [I]);/* recursive call */
Pos --;
}
Return 0;
}
Void dump (struct trie_node * node)
{
Static int depth = 0;
Static const char prefix [] = "";
Int loop = 0;
If (node = NULL)
Return;
For (loop = 0; loop <max_char_number; loop ++)
{
If (node-> next _ [loop])
{
Printf ("%. * s", (INT) Depth ++, prefix );
Printf ("next ['% C'] = 0x % x, Count = % d/N", loop, (unsigned INT) (node-> next _ [loop]), node-> next _ [loop]-> count _);
Dump (node-> next _ [loop]);/* recursive call */
Depth --;
}
}
}
/**
 * Program entry: read the words, then print the per-word counts followed by
 * the raw node dump, each section preceded by an empty line.
 */
int main(void)
{
    input();
    printf("\n");
    traverse(&root);
    printf("\n");
    dump(&root);
    return 0;
}
/**
 * Trie tree test
 * Description: count how often every word occurs in the input
 * Usage: type some strings, each followed by Enter, and finish with '#'
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* getline() and strsep() are not ISO C; glibc hides them in strict mode */
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define max_char_number 256 /* number of child slots per node, one per byte value */
#define max_word_len 128    /* sizes the word buffer used by traverse() */

/* One trie node: an occurrence counter plus one child pointer per character code. */
struct trie_node
{
    int count_;                               /* bumped by insert() on the node where a word ends */
    struct trie_node *next_[max_char_number]; /* children, indexed by character code; NULL = absent */
};

/* The root holds no character of its own; it starts with no children. */
static struct trie_node root = {0, {NULL}};

/* Delimiter set passed to strsep() by input(): tab, newline, space, / . " ' ( ) */
static const char *spaces = "\t\n /.\"\'()";
Static int insert (const char * word)
{
Int loop;
Struct trie_node * cur, * newnode;
If (word [0] = '/0 ')
Return 0;
Cur = & root;
For (loop = 0; ++ loop)
{
If (cur-> next _ [word [loop] = NULL)
{
Newnode = (struct trie_node *) malloc (sizeof (struct trie_node ));
Memset (newnode, 0, sizeof (struct trie_node ));
Cur-> next _ [word [loop] = newnode;
}
If (word [loop] = '/0 ')
Break;
Cur = cur-> next _ [word [loop];
}
Cur-> count _ ++;
Return 0;
}
Void input ()
{
Char * linebuf = NULL, * line = NULL, * word = NULL;
Size_t bufsize = 0;
Int ret;
While (1)
{
Ret = Getline (& linebuf, & bufsize, stdin );
If (ret =-1)
Break;
Line = linebuf;
WORD = strsep (& line, spaces );
/* Input' # 'will terminate this input */
If (strcmp (word, "#") = 0)
Break;
If (word [0] = '/0 ')
Continue;
Insert (Word );
}
}
/**
 * Print one result row to stdout: the word, a tab, its count, a newline.
 *
 * @param str  NUL-terminated word to print.
 * @param n    number of occurrences of the word.
 */
static void printword(const char *str, int n)
{
    printf("%s\t%d\n", str, n);
}
Static int traverse (struct trie_node * rootp)
{
Static char worddump [max_word_len + 1];
Static int Pos = 0;
Int I;
If (rootp = NULL)
Return 0;
If (rootp-> count _)
{
Worddump [POS] = '/0 ';
Printword (worddump, rootp-> count _);
}
For (I = 0; I <max_char_number; ++ I)
{
Worddump [POS ++] = I;
Traverse (rootp-> next _ [I]);/* recursive call */
Pos --;
}
Return 0;
}
Void dump (struct trie_node * node)
{
Static int depth = 0;
Static const char prefix [] = "";
Int loop = 0;
If (node = NULL)
Return;
For (loop = 0; loop <max_char_number; loop ++)
{
If (node-> next _ [loop])
{
Printf ("%. * s", (INT) Depth ++, prefix );
Printf ("next ['% C'] = 0x % x, Count = % d/N", loop, (unsigned INT) (node-> next _ [loop]), node-> next _ [loop]-> count _);
Dump (node-> next _ [loop]);/* recursive call */
Depth --;
}
}
}
/**
 * Program entry: read the words, then print the per-word counts followed by
 * the raw node dump, each section preceded by an empty line.
 */
int main(void)
{
    input();
    printf("\n");
    traverse(&root);
    printf("\n");
    dump(&root);
    return 0;
}

(3) running result

#./Trie

A

To

Tea

Ted

Ten

I

In

Inn

#

 

A 1

I 1

In 1

Inn 1

Tea 1

Ted 1

Ten 1

To 1

 

next['a'] = 0x88c1088, Count = 1

next[''] = 0x88c1490, Count = 0

next['i'] = 0x88c40e8, Count = 1

next[''] = 0x88c44f0, Count = 0

next['n'] = 0x88c48f8, Count = 1

next[''] = 0x88c4d00, Count = 0

next['n'] = 0x88c5108, Count = 1

next[''] = 0x88c5510, Count = 0

next['t'] = 0x88c1898, Count = 0

next['e'] = 0x88c24b0, Count = 0

next['a'] = 0x88c28b8, Count = 1

next[''] = 0x88c2cc0, Count = 0

next['d'] = 0x88c30c8, Count = 1

next[''] = 0x88c34d0, Count = 0

next['n'] = 0x88c38d8, Count = 1

next[''] = 0x88c3ce0, Count = 0

next['o'] = 0x88c1ca0, Count = 1

next[''] = 0x88c20a8, Count = 0

(4) Analysis

 

In this program, the author added a dump function that recursively prints the non-NULL entries of each node's next array, as shown in the figure (not reproduced here).

 

 

With the occurrence counts added, the trie looks like the figure (not reproduced here). The number on each edge (solid or dashed) is the subscript into the next array, which is the ASCII code of the character held by the node the edge leads to. In the program, the character itself (that is, its ASCII code) is used directly as the subscript of the next array. A child is therefore reached in O(1) by direct indexing, with no searching.

 

 

As shown in the figure:

the count at next[97] is 1, i.e. the string "a" occurs once;

the count at next[105] is also 1, i.e. the string "i" occurs once;

the count at next[105]->next[110] is 1, i.e. the string "in" occurs once;

the count at next[105]->next[110]->next[110] is 1, i.e. the string "inn" occurs once;

...

 

How do you feel? Is it intuitive?

-- Of course!

3. A slight modification

(1) Analysis

 

In the preceding example the program already records a count of 1 at next[105]->next[110], i.e. the string "in" occurs once. So why does next[105]->next[110]->next[0] also have a value? In other words, that next[0] pointer actually points to an allocated node, even though the contents of that node are all zero.

 

If many strings are counted, there are many leaf nodes and many word-ending nodes such as 'i', each of which gets one of these extra next[0] children, so the wasted space becomes significant. From the program, each node occupies sizeof(struct trie_node) = 1028 bytes (a 4-byte int plus 256 4-byte pointers, as on the 32-bit platform used here). With 10,000 such nodes, 1028 × 10,000 bytes, about 10 MB, are wasted.

 

(2) Modification

 

Debugging the code above leads to the insert function. The exit check of the for loop that inserts a string into the trie can be moved so that it runs before a node is allocated, as follows.

Static int insert (const char * word)

{

Int loop;

Struct trie_node * cur, * newnode;

 

 

If (word [0] = '/0 ')

Return 0;

 

Cur = & root;

For (loop = 0; ++ loop)

{

If (word [loop] = '/0')/* The Break condition shocould be here */

Break;

 

If (cur-> next _ [word [loop] = NULL)

{

Newnode = (struct trie_node *) malloc (sizeof (struct trie_node ));

Memset (newnode, 0, sizeof (struct trie_node ));

Cur-> next _ [word [loop] = newnode;

}

 

Cur = cur-> next _ [word [loop];

}

Cur-> count _ ++;

 

Return 0;

}

(3) Code

Omitted.

(4) running result

#./Trie

A

To

Tea

Ted

Ten

I

In

Inn

#

 

A 1

I 1

In 1

Inn 1

Tea 1

Ted 1

Ten 1

To 1

 

next['a'] = 0x8bb8088, Count = 1

next['i'] = 0x8bb9cc0, Count = 1

next['n'] = 0x8bba0c8, Count = 1

next['n'] = 0x8bba4d0, Count = 1

next['t'] = 0x8bb8490, Count = 0

next['e'] = 0x8bb8ca0, Count = 0

next['a'] = 0x8bb90a8, Count = 1

next['d'] = 0x8bb94b0, Count = 1

next['n'] = 0x8bb98b8, Count = 1

next['o'] = 0x8bb8898, Count = 1

(5) Analysis

 

Comparing the running results before and after the modification, we find that the modified program no longer creates the next[0] nodes, as shown in the figure (not reproduced here).

 

It can also be seen that, starting from the root node:

the count at next[97] is 1, i.e. the string "a" occurs once;

the count at next[105] is also 1, i.e. the string "i" occurs once;

the count at next[105]->next[110] is 1, i.e. the string "in" occurs once;

the count at next[105]->next[110]->next[110] is 1, i.e. the string "inn" occurs once;

...

 

Compared with the previous graph, this graph does not have junk data of next [0], which is more intuitive.

 

(6) A slightly complex result

#./Trie

A

To

Tea

Ted

Ten

I

In

Inn

A

To

Tea

Ted

Ten

I

In

Inn

A

I

A

A

#

 

A 5

I 3

In 2

Inn 2

Tea 2

Ted 2

Ten 2

To 2

 

next['a'] = 0x9604088, Count = 5

next['i'] = 0x9605cc0, Count = 3

next['n'] = 0x96060c8, Count = 2

next['n'] = 0x96064d0, Count = 2

next['t'] = 0x9604490, Count = 0

next['e'] = 0x9604ca0, Count = 0

next['a'] = 0x96050a8, Count = 2

next['d'] = 0x96054b0, Count = 2

next['n'] = 0x96058b8, Count = 2

next['o'] = 0x9604898, Count = 2

4. Summary

 

This article briefly introduces the basic concepts of the trie and uses an example to illustrate its application.

When designing a program, a few simple techniques can improve its efficiency. For example, this article uses the character itself (that is, its ASCII code) as the subscript of the next array, which gives O(1) access to a child node. Pay attention, too, to (loop) boundary conditions, so that no junk data is created.

 

 

Reference

http://zh.wikipedia.org/wiki/Trie

http://en.wikipedia.org/wiki/Trie
http://en.wikipedia.org/wiki/Hash_trie

http://en.wikipedia.org/wiki/Hash_array_mapped_trie

http://www.topcoder.com/tc?module=Static&d1=tutorials&d2=usingTries

 

 

This article comes from the CSDN blog; please cite the source when reposting: http://blog.csdn.net/livelylittlefish/archive/2011/05/10/6410569.aspx

 

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.