For a brief introduction to the topic, see: http://blog.imaginea.com/mysql-query-parsing/
Key points:
1. SQL parsing consists of a parser and a lexical analyzer (lexer).
The easy way is to use the Bison/Flex combination, but MySQL's lexical analyzer is hand-crafted.
The parser's entry function is MYSQLparse, and the lexical analyzer's entry function is MYSQLlex.
2. The lexical analyzer checks whether each token is a keyword.
The most straightforward way to do this is to keep one large sorted keyword array and binary-search it. MySQL has done some optimizations here.
This article is mainly about this part.
Since the keyword list is read-only, building a read-only lookup tree over it can improve lookup performance.
To generate the lookup tree:
1. Read the keyword array and build a trie from it.
2. Flatten the trie into an array (i.e. a tree represented by array indexes rather than by linked nodes).
Using the lookup tree:
This is relatively simple; just read the function get_hash_symbol directly.
The makefile rules that generate the lookup tree:
In 'sql/CMakeFiles/sql.dir/build.make':
sql/lex_hash.h: sql/gen_lex_hash
$(CMAKE_COMMAND) -E cmake_progress_report /home/zedware/workspace/mysql/CMakeFiles $(CMAKE_PROGRESS_153)
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Generating lex_hash.h"
cd /home/zedware/workspace/mysql/sql && ./gen_lex_hash > lex_hash.h
It is easy to see that the main function is 'get_hash_symbol'; its main call relationships are:
/* sql/lex_hash.h */
get_hash_symbol -> sql_functions_map
get_hash_symbol -> symbols_map
/* sql/sql_lex.cc */
find_keyword -> get_hash_symbol
is_keyword -> get_hash_symbol
is_lex_native_function -> get_hash_symbol
Example of a tree, taken from the comment in the file "gen_lex_hash.cc":
+-----------+-+-+-+
|       len |1|2|3|
+-----------+-+-+-+
|first_char |0|0|a|
|last_char  |0|0|d|
|link       |0|0|+|
                 |
                 V
       +----------+-+-+-+--+
       |    1 char|a|b|c|d |
       +----------+-+-+-+--+
       |first_char|d|0|0|0 |
       |last_char |n|0|0|-1|
       |link      |+|0|0|+ |
                   |     |
                   |     V
                   |  symbols[2] ("DAY")
                   V
+----------+--+-+-+-+-+-+-+-+-+-+--+
|    2 char|d |e|f|g|h|i|j|k|l|m|n |
+----------+--+-+-+-+-+-+-+-+-+-+--+
|first_char|0 |0|0|0|0|0|0|0|0|0|0 |
|last_char |-1|0|0|0|0|0|0|0|0|0|-1|
|link      |+ |0|0|0|0|0|0|0|0|0|+ |
            |                    |
            V                    V
    symbols[0] ("ADD")   symbols[1] ("AND")
If you remember how a trie works, this is easier to understand. Below are the trees (and the generated arrays) for successively larger input arrays.
i=0
+-----------+-+--+
|       len |1| 2|
+-----------+-+--+
|first_char |0|-1|
|last_char  |0| 0|
|char_tails |0| X|
|ithis      |0| 0|
|iresult    |0| 0|
                |
                &&
static SYMBOL symbols[] = {
{"&&", SYM(AND_AND_SYM)},
static uchar symbols_map[8]= {
0, 0, 1, 0, <=== 1 == the number of elements in symbols[], which means "not found"
0, 0, 0, 0, <=== symbols[0]
};
i=1
+-----------+--+--+
|       len | 1| 2|
+-----------+--+--+
|first_char |-1|-1|
|last_char  | 0| 0|
|char_tails | X| X|
|ithis      | 0| 0|
|iresult    | 1| 0|
              |  |
              <  &&
static SYMBOL symbols[] = {
{"&&", SYM(AND_AND_SYM)},
{"<", SYM(LT)},
static uchar symbols_map[8]= {
0, 0, 1, 0, <=== 1 < 2 (the number of elements in symbols[]), so the match is symbols[1]
0, 0, 0, 0, <=== symbols[0]
};
i=2
+-----------+--+--+
|       len | 1| 2|
+-----------+--+--+
|first_char |-1| &|
|last_char  | 0| <|
|char_tails | X| ^|
|ithis      | 0| 0|
|iresult    | 1| X|
              |  |
              <  |
                 |
       +----------+--+--+   +--+
       |    1 char| &|  |...| <|
       +----------+--+--+   +--+
       |first_char|-1| 0|   |-1|
       |last_char | 0| 0|   | 0|
       |char_tails| 0| 0|   | X|
       |ithis     | 0| 0|   | 0|
       |iresult   | 0| 0|   | 2|
                    |         |
                    &&        <=
static SYMBOL symbols[] = {
{"&&", SYM(AND_AND_SYM)},
{"<", SYM(LT)},
{"<=", SYM(LE)},
static uchar symbols_map[100]= {
0, 0, 1, 0,
'&', '<', 2, 0,
0, 0, 0, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 3, 0,
0, 0, 2, 0,
};
i=3
+-----------+--+--+
|       len | 1| 2|
+-----------+--+--+
|first_char |-1| &|
|last_char  | 0| <|
|char_tails | X| ^|
|ithis      | 0| 0|
|iresult    | 1| X|
              |  |
              <  |
                 |
       +----------+--+--+   +--+
       |    1 char| &|  |...| <|
       +----------+--+--+   +--+
       |first_char|-1| 0|   |-1|
       |last_char | 0| 0|   | 0|
       |char_tails| 0| 0|   | X|
       |ithis     | 0| 0|   | 0|
       |iresult   | 0| 0|   | p|
                    |         |
                    &&        |
                              |
                       +----------+--+--+
                       |    2 char| =| >|
                       +----------+--+--+
                       |first_char|-1|-1|
                       |last_char | 0| 0|
                       |char_tails| X| X|
                       |ithis     | 0| 0|
                       |iresult   | 2| 3|
                                    |  |
                                    <= <>
static SYMBOL symbols[] = {
{"&&", SYM(AND_AND_SYM)},
{"<", SYM(LT)},
{"<=", SYM(LE)},
{"<>", SYM(NE)},
static uchar symbols_map[108]= {
0, 0, 1, 0,
'&', '<', 2, 0,
0, 0, 0, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
0, 0, 4, 0,
'=', '>', 25, 0,
0, 0, 2, 0,
0, 0, 3, 0,
};
As you can see, the array representation wastes some space. With some extra effort, the table could be compressed a little further.