Introduction
All talk and no practice is just for show.
In this section, we will run an experiment: implementing a simple lexical analyzer in C to deepen our understanding of lexical analysis. If you are interested, work through the source code yourself. It is quite simple, so there is no flow chart; my apologies for that. Let's start.
4.1 experiment description
For example, for the source program:
begin x := 9; if x > 9 then x := 2 * x + 1/3; end #
After lexical analysis, the following sequence is output:
<1, begin> <10, x> <18, :=> <11, 9> <26, ;> <2, if> ......
4.1.1 simple lexical analysis
(1) KEYWORDS:
begin if then while do end
All keywords are in lowercase.
(2) Operators and delimiters:
: := + - * / < <= <> > >= = ; ( ) #
(3) Other words are identifiers (ID) and integer constants (NUM), which are defined by the following regular expressions:
ID = letter (letter | digit )*
NUM = digit digit*
(4) Whitespace consists of spaces, tabs, and line breaks. Whitespace is generally used to separate IDs, NUMs, operators, delimiters, and keywords.
4.1.2 types of codes corresponding to various word symbols:
Table 4.2.1 types of word symbols
Word symbols
Category Code
Word symbols
Category Code
begin
1
:
17
if
2
:=
18
then
3
<
20
while
4
<>
21
do
5
<=
22
end
6
>
23
letter (letter | digit)*
10
>=
24
digit digit*
11
=
25
+
13
;
26
-
14
(
27
*
15
)
28
/
16
#
0
4.2 source code reference
[Html]
#include <limits.h>
#include <stdio.h>
#include <string.h>
/*
 * Shared scanner state.
 *
 * prog   - the source text read by main(); scanning stops at the '#' marker.
 * token  - text of the most recently scanned word or operator (NUL-terminated).
 * ch     - the character currently being examined by scaner().
 */
char prog[80], token[8], ch;

/*
 * syn - category code of the most recent token (-1 on error, 0 for '#').
 * p   - index of the next unread character in prog.
 * m   - index of the next free slot in token.
 * n   - loop index used for the keyword lookup in scaner().
 * sum - numeric value of the most recent integer constant.
 */
int syn, p, m, n, sum;

/*
 * Reserved words.  A word found at index i gets category code i + 1.
 * Entries must match the source text exactly (no padding), because
 * scaner() compares them with strcmp().
 */
char *rwtab[6] = {"begin", "if", "then", "while", "do", "end"};

/* Scans the next token from prog; results are left in syn, token and sum. */
void scaner(void);
Main ()
{
P = 0;
Printf ("\ n please input a string (end with '#'): \ n ");
Do {
Scanf ("% c", & ch );
Prog [p ++] = ch;
} While (ch! = '#');
P = 0;
Do {
Scaner ();
Switch (syn)
{
Case 11:
Printf ("(%-10d % 5d) \ n", sum, syn );
Break;
Case-1:
Printf ("you have input a wrong string \ n ");
// Getch ();
Return 0;
Break;
Default:
Printf ("(%-10 s % 5d) \ n", token, syn );
Break;
}
} While (syn! = 0 );
// Getch ();
}
Void scaner (void)
{
Sum = 0;
For (m = 0; m <8; m ++)
Token [m ++] = NULL;
Ch = prog [p ++];
M = 0;
While (ch = '') | (ch = '\ n '))
Ch = prog [p ++];
If (ch <= 'Z') & (ch> = 'A') | (ch <= 'Z ') & (ch> = 'A ')))
{
While (ch <= 'Z') & (ch> = 'A') | (ch <= 'Z ') & (ch> = 'A') | (ch> = '0') & (ch <= '9 ')))
{
Token [m ++] = ch;
Ch = prog [p ++];
}
P --;
Syn = 10;
For (n = 0; n <6; n ++)
If (strcmp (token, rwtab [n]) = 0)
{
Syn = n + 1;
Break;
}
}
Else if (ch> = '0') & (ch <= '9 '))
{
While (ch> = '0') & (ch <= '9 '))
{
Sum = sum * 10 + ch-'0 ';
Ch = prog [p ++];
}
P --;
Syn = 11;
}
Else
{
Switch (ch)
{
Case '<':
Token [m ++] = ch;
Ch = prog [p ++];
If (ch = ')
{
Syn = 22;
Token [m ++] = ch;
}
Else
{
Syn = 20;
P --;
}
Break;
Case '> ':
Token [m ++] = ch;
Ch = prog [p ++];
If (ch = ')
{
Syn = 24;
Token [m ++] = ch;
}
Else
{
Syn = 23;
P --;
}
Break;
Case '+ ':
Token [m ++] = ch;
Ch = prog [p ++];
If (ch = '+ ')
{
Syn = 17;
Token [m ++] = ch;
}
Else
{
Syn = 13;
P --;
}
Break;
Case '-':
Token [m ++] = ch;
Ch = prog [p ++];
If (ch = '-')
{
Syn = 29;
Token [m ++] = ch;
}
Else
{
Syn = 14;
P --;
}
Break;
Case '! ':
Ch = prog [p ++];
If (ch = ')
{
Syn = 21;
Token [m ++] = ch;
}
Else
{
Syn = 31;
P --;
}
Break;
Case '= ':
Token [m ++] = ch;
Ch = prog [p ++];
If (ch = ')
{
Syn = 25;
Token [m ++] = ch;
}
Else
{
Syn = 18;
P --;
}
Break;
Case '*':
Syn = 15;
Token [m ++] = ch;
Break;
Case '/':
Syn = 16;
Token [m ++] = ch;
Break;
Case '(':
Syn = 27;
Token [m ++] = ch;
Break;
Case ')':
Syn = 28;
Token [m ++] = ch;
Break;
Case '{':
Syn = 5;
Token [m ++] = ch;
Break;
Case '}':
Syn = 6;
Token [m ++] = ch;
Break;
Case ';':
Syn = 26;
Token [m ++] = ch;
Break;
Case '\"':
Syn = 30;
Token [m ++] = ch;
Break;
Case '#':
Syn = 0;
Token [m ++] = ch;
Break;
Case ':':
Syn = 17;
Token [m ++] = ch;
Break;
Default:
Syn =-1;
Break;
}
}
Token [m ++] = '\ 0 ';
}
4.3 Summary:
Lexical analysis refers to reading a string from the source code sequence of the program cyclically, determining its attributes according to the lexical requirements, and then forming a lexical unit. For programming languages in reality, the lexical structure is complicated and generally expressed using regular expressions.
Author: rill_zhen