Implementation code for a simple lexical analyzer (Java implementation)

Implementation code for a simple lexical analyzer (Java implementation)

Last Update:2018-07-28 Source: Internet

Author: User

Tags lexer stringbuffer

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more ＞

Http://www.cnblogs.com/xuqiang/archive/2010/09/21/1953501.html

Main.java

* * * Main program/import java.io.*; Import lexer.*; public class Main {public static void main (string[] args) throws IOException {Lexer Lexer = new Lexer (); while (lexer.ge Treaderstate () = = False) {Lexer.scan ();}/* Save relevant information/Lexer.savetokens (); Lexer.savesymbolstable (); } }

Lexer.java

Package lexer; Import java.io.*; Import java.util.*; Import symbols.*; public class Lexer {public static int line = 1;/* Record line number */char peek = ';/* Next read character/hashtable<string, word> W ords = new hashtable<string, word> (); /* Symbol table */private hashtable<token, string> table = new Hashtable<token, string> (); /* Token sequence */private list<string> tokens = new linkedlist<string> (); /* Read File variable * * BufferedReader reader = null; /* Save whether the current read to the end of the file/private Boolean isend = false; /* Read to the end of the file/public Boolean getreaderstate () {return this.isend}/* Save the */public void savesymbolstable () stored in the table Throws IOException {FileWriter writer = new FileWriter ("symbol table. txt"); Writer.write ("[Symbol] [Symbol type information]\n"); Writer.write ("\ r \ n" ); Enumeration<token> e = Table.keys (); while (E.hasmoreelements ()) {Token Token = (Token) e.nextelement (); String desc = table.get (token); /* Write File * * Writer.write (token + "\t\t\t" + desc + "\ r \ n"); } writer.flush (); }/* Save tokens */Publicvoid Savetokens () throws IOException {FileWriter writer = new FileWriter ("tokens table. txt"); Writer.write ("[symbol] \ n"); writer. Write ("\ r \ n"); for (int i = 0; i < tokens.size (); ++i) {String tok = (String) tokens.get (i);/* Write File/writer.write (tok + "\ r \ n");} W Riter.flush (); } void Reserve (Word W) {Words.put (W.lexme, w);}/* constructor add keyword and type to hashtable words/public Lexer () {* * Initialize read file variable * * try {reader = new BufferedReader (new FileReader ("input. 
txt");} catch (IOException e) {System.out.print (e);}/* keyword */th Is.reserve (New Word ("If", Tag.if)); This.reserve (New Word ("then", Tag.then)); This.reserve (New Word ("Else", Tag.else)); This.reserve (New Word ("while", Tag.while)); This.reserve (New Word ("Do", tag.do)); /* Type * * This.reserve (word.true); This.reserve (Word.false); This.reserve (Type.int); This.reserve (Type.char); This.reserve (Type.bool); This.reserve (type.float);end = true; }//Peek = (char) System.in.read (); Public Boolean readch (char ch) throws IOException {READCH (), if (This.peek!= ch) {return false;} This.peek = '; re Turn true; Public Token Scan () throws IOException {/* eliminate whitespace */for (;; Readch ()) {if (peek = = ' | | | = PEEK = = ' t ') continue; else if (peek = = ' \ n ') line = line + 1; else break; /* Below start to split keywords, identifiers and other information * * Switch (PEEK) {/* for = =, >=, <=,!= distinction using the state machine to achieve * * case ' = ': if (readch (' = ')) {Tokens.add ( "=="); return word.eq; else {tokens.add ("="); return the new Token (' = ');} case ' > ': if (readch (' = ')) {Tokens.add (">="); else {tokens.add (">"), Return new Token (' > '), Case ' < ': if (readch (' = ')) {Tokens.add ("<="), return Word. Le else {Tokens.add ("<"); return to New Token (' < ');} case '! ': if (readch (' = ')) {Tokens.add ("!="); else {tokens.add ("!"); return new Token ('! ');} /* The following is the recognition of the number, according to the grammatical rules, here the * number as long as it can recognize the integer on the line. 
*/if (Character.isdigit (peek)) {int Value = 0; Do {value = ten * value + character.digit (Peek, ten); Readch (); while (Character.isdigit (Peek)); num n = new num (value); Tokens.add (N.tostring ()); Table.put (N, "Num"); return n; }/* keyword or identifier identification */if (Character.isletter (Peek)) {StringBuffer sb = new StringBuffer ();/* First get the entire partition/do {Sb.appe nd (PEEK); READCH (); while (Character.isletterordigit (Peek)); /* judgment is the keyword or identifier */String s = sb.tostring (); Word w = (word) words.get (s); /* If it is a keyword or type, w should not be empty */if (w!= null) {//Table.put (W, "KeyWord or type"); Tokens.add (w.tostring ()); return w;/* description is keyword or type name * * * Otherwise it is an identifier ID/w = new Word (s, tag.id); Tokens.add (W.tostring ()); Table.put (W, "id"); Words.put (S, W); Return w; /* Any character in Peek is considered to be a lexical unit return */Token tok = new Token (peek); Table.put (Tok, "Token or seprator"); if (int) Peek!= 0xffff) Tokens.add (tok.tostring ()); Peek = '; return Tok; } }

Num.java

Package lexer; public class Num extends token{the public final int value, public Num (int v) {super (tag.num), this.value = v} public strin G ToString () {return "" + Value;}}

Tag.java

package lexer;

/**
 * Integer tags identifying the kinds of multi-character tokens.
 * Values start at 256 so they never collide with single-character tokens,
 * whose tag is the character code itself.
 */
public class Tag {
    public final static int
        AND   = 256, BASIC = 257, BREAK = 258, DO    = 259, ELSE  = 260,
        EQ    = 261, /* == */
        FALSE = 262, GE    = 263, ID    = 264, IF    = 265, INDEX = 266,
        LE    = 267, MINUS = 268, NE    = 269, NUM   = 270, OR    = 271,
        REAL  = 272, TEMP  = 273, TRUE  = 274, WHILE = 275,
        /* Added afterwards. */
        THEN  = 276;
}

Token.java

package lexer;

/** Base class of all tokens; a plain Token stands for a single character. */
public class Token {

    /** Kind of token: a character code, or one of the Tag constants. */
    public final int tag;

    /**
     * Creates a token with the given tag.
     *
     * @param t the character code or Tag constant
     */
    Token(int t) {
        this.tag = t;
    }

    /** Returns the tag interpreted as a single character. */
    public String toString() {
        return "" + (char) tag;
    }

    /** Small self-check: prints the token for 'a'. */
    public static void main(String[] args) {
        Token tok = new Token('a');
        System.out.println(tok);
    }
}

Word.java

/* Class word is used to manage reserved words, identifiers, and compound word elements such as &&. * * Package lexer; public class Word extends Token {public String lexme = ""; Public Word (String s, int t) {super (t); this.lexme = S.} public String toString () {return this.lexme;} public static Final word and = new Word ("&&", tag.and), or = new Word ("| |", tag.or), eq = new Word ("= =", tag.eq), NE = new word ("!=", tag.ne), le = new Word ("<=", tag.le), GE = new Word (">=", tag.ge), minus = new word ("minus", Tag.minus), True = new Word ("true", tag.true), False = new Word ("False", tag.false), temp = new Word ("T", tag.temp); }

Type.java

* * Description Data type * * Package symbols; Import lexer.*; public class type extends word{the public type (String s, int tag) {Super (S, tag);} The public static final Type int = new Type ( "int", tag.basic), Float = new Type ("float", tag.basic), Char = new Type ("char", tag.basic), Bool = new Type ("bool", Tag. BASIC); }

============

http://freewxy.iteye.com/blog/870016

What is lexical structure?

Source code is a stream of characters made up of keywords, variable names, method names, parentheses, and so on. The rules governing these elements (for example, that a variable name cannot contain punctuation and must begin with a letter rather than a digit) make up the lexical structure of the language.

What is lexical analysis?

The lexical analysis phase is the first phase of the compilation process. The task at this stage is to read the source program from left to right, one character at a time, i.e. to scan the stream of characters that compose the source program and recognize the word (also known as a word symbol or symbol) according to the word-formation rule.

Lexical of the simple language to be analyzed:

1) Key Words

begin if then while do end

2) Operators and delimiters

: := + - * / < <= > >= <> = ; ( ) #

3) All other words are identifiers (ID) and integer constants (NUM), given by the following formal definitions:

ID = letter (letter | digit)*

NUM = digit digit*

4) Whitespace consists of blanks, tabs, and line breaks. It is generally used to separate IDs, NUMs, operators, delimiters, and keywords, and is usually ignored during the lexical analysis phase.

Category encoding for various word symbols

Word symbol	Category Code	Word symbol	Category Code
begin	1	:	17
if	2	:=	18
then	3	<	20
while	4	<>	21
do	5	<=	22
end	6	>	23
letter (letter\|digit)*	10	>=	24
digit digit*	11	=	25
+	13	;	26
-	14	(	27
*	15	)	28
/	16	#	0

The function of the lexical analysis program:

Input: Source program string for the given grammar

Output: a sequence of two-tuples of the form (syn, token) or (syn, sum).

syn is the category code of the word;

token is the string of the word itself;

sum is the value of an integer constant.

For example, for the source program begin x:=9; if x>0 then x:=2*x+1/3; end# the lexical analyzer outputs the following sequence: (1,begin) (10,x) (18,:=) (11,9) (26,;) (2,if) ...

Flow chart:

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More