Implementation code for a simple lexical analyzer (Java implementation) __java

Source: Internet
Author: User
Tags lexer stringbuffer

Http://www.cnblogs.com/xuqiang/archive/2010/09/21/1953501.html

Main.java

* * * Main program/import java.io.*; Import lexer.*; public class Main {public static void main (string[] args) throws IOException {Lexer Lexer = new Lexer (); while (lexer.ge Treaderstate () = = False) {Lexer.scan ();}/* Save relevant information/Lexer.savetokens (); Lexer.savesymbolstable (); } }

Lexer.java

Package lexer; Import java.io.*; Import java.util.*; Import symbols.*; public class Lexer {public static int line = 1;/* Record line number */char peek = ';/* Next read character/hashtable<string, word> W ords = new hashtable<string, word> (); /* Symbol table */private hashtable<token, string> table = new Hashtable<token, string> (); /* Token sequence */private list<string> tokens = new linkedlist<string> (); /* Read File variable * * BufferedReader reader = null; /* Save whether the current read to the end of the file/private Boolean isend = false; /* Read to the end of the file/public Boolean getreaderstate () {return this.isend}/* Save the */public void savesymbolstable () stored in the table Throws IOException {FileWriter writer = new FileWriter ("symbol table. txt"); Writer.write ("[Symbol] [Symbol type information]\n"); Writer.write ("\ r \ n" ); Enumeration<token> e = Table.keys (); while (E.hasmoreelements ()) {Token Token = (Token) e.nextelement (); String desc = table.get (token); /* Write File * * Writer.write (token + "\t\t\t" + desc + "\ r \ n"); } writer.flush (); }/* Save tokens */Publicvoid Savetokens () throws IOException {FileWriter writer = new FileWriter ("tokens table. txt"); Writer.write ("[symbol] \ n"); writer. Write ("\ r \ n"); for (int i = 0; i < tokens.size (); ++i) {String tok = (String) tokens.get (i);/* Write File/writer.write (tok + "\ r \ n");} W Riter.flush (); } void Reserve (Word W) {Words.put (W.lexme, w);}/* constructor add keyword and type to hashtable words/public Lexer () {* * Initialize read file variable * * try {reader = new BufferedReader (new FileReader ("input. 
txt");} catch (IOException e) {System.out.print (e);}/* keyword */th Is.reserve (New Word ("If", Tag.if)); This.reserve (New Word ("then", Tag.then)); This.reserve (New Word ("Else", Tag.else)); This.reserve (New Word ("while", Tag.while)); This.reserve (New Word ("Do", tag.do)); /* Type * * This.reserve (word.true); This.reserve (Word.false); This.reserve (Type.int); This.reserve (Type.char); This.reserve (Type.bool); This.reserve (type.float);end = true; }//Peek = (char) System.in.read (); Public Boolean readch (char ch) throws IOException {READCH (), if (This.peek!= ch) {return false;} This.peek = '; re Turn true; Public Token Scan () throws IOException {/* eliminate whitespace */for (;; Readch ()) {if (peek = = ' | | | = PEEK = = ' t ') continue; else if (peek = = ' \ n ') line = line + 1; else break; /* Below start to split keywords, identifiers and other information * * Switch (PEEK) {/* for = =, >=, <=,!= distinction using the state machine to achieve * * case ' = ': if (readch (' = ')) {Tokens.add ( "=="); return word.eq; else {tokens.add ("="); return the new Token (' = ');} case ' > ': if (readch (' = ')) {Tokens.add (">="); else {tokens.add (">"), Return new Token (' > '), Case ' < ': if (readch (' = ')) {Tokens.add ("<="), return Word. Le else {Tokens.add ("<"); return to New Token (' < ');} case '! ': if (readch (' = ')) {Tokens.add ("!="); else {tokens.add ("!"); return new Token ('! ');} /* The following is the recognition of the number, according to the grammatical rules, here the * number as long as it can recognize the integer on the line. 
*/if (Character.isdigit (peek)) {int Value = 0; Do {value = ten * value + character.digit (Peek, ten); Readch (); while (Character.isdigit (Peek)); num n = new num (value); Tokens.add (N.tostring ()); Table.put (N, "Num"); return n; }/* keyword or identifier identification */if (Character.isletter (Peek)) {StringBuffer sb = new StringBuffer ();/* First get the entire partition/do {Sb.appe nd (PEEK); READCH (); while (Character.isletterordigit (Peek)); /* judgment is the keyword or identifier */String s = sb.tostring (); Word w = (word) words.get (s); /* If it is a keyword or type, w should not be empty */if (w!= null) {//Table.put (W, "KeyWord or type"); Tokens.add (w.tostring ()); return w;/* description is keyword or type name * * * Otherwise it is an identifier ID/w = new Word (s, tag.id); Tokens.add (W.tostring ()); Table.put (W, "id"); Words.put (S, W); Return w; /* Any character in Peek is considered to be a lexical unit return */Token tok = new Token (peek); Table.put (Tok, "Token or seprator"); if (int) Peek!= 0xffff) Tokens.add (tok.tostring ()); Peek = '; return Tok; } }

Num.java

Package lexer; public class Num extends token{the public final int value, public Num (int v) {super (tag.num), this.value = v} public strin G ToString () {return "" + Value;}}

Tag.java

package lexer;

/**
 * Integer tags identifying the kinds of multi-character tokens. The values
 * start at 256 so they cannot collide with single-character tokens, whose tag
 * is the character code itself.
 */
public class Tag {
    public final static int
        AND = 256,
        BASIC = 257,
        BREAK = 258,
        DO = 259,
        ELSE = 260,
        EQ = 261, /* == */
        FALSE = 262,
        GE = 263,
        ID = 264,
        IF = 265,
        INDEX = 266,
        LE = 267,
        MINUS = 268,
        NE = 269,
        NUM = 270,
        OR = 271,
        REAL = 272,
        TEMP = 273,
        TRUE = 274,
        WHILE = 275,
        /* added afterwards */
        THEN = 276;
}

Token.java

Package lexer; public class Token {public final int tag; Token (int t) {this.tag = t;} public String toString () {return "" + (c) HAR) tag; public static void Main (string[] args) {Token tok = new Token (' a '); System.out.println (Tok); } }

Word.java

/* Class word is used to manage reserved words, identifiers, and compound word elements such as &&. * * Package lexer; public class Word extends Token {public String lexme = ""; Public Word (String s, int t) {super (t); this.lexme = S.} public String toString () {return this.lexme;} public static Final word and = new Word ("&&", tag.and), or = new Word ("| |", tag.or), eq = new Word ("= =", tag.eq), NE = new word ("!=", tag.ne), le = new Word ("<=", tag.le), GE = new Word (">=", tag.ge), minus = new word ("minus", Tag.minus), True = new Word ("true", tag.true), False = new Word ("False", tag.false), temp = new Word ("T", tag.temp); }

Type.java

* * Description Data type * * Package symbols; Import lexer.*; public class type extends word{the public type (String s, int tag) {Super (S, tag);} The public static final Type int = new Type ( "int", tag.basic), Float = new Type ("float", tag.basic), Char = new Type ("char", tag.basic), Bool = new Type ("bool", Tag. BASIC); }

============

http://freewxy.iteye.com/blog/870016

What is a lexical grammar?

Source code is a stream of characters that make up keywords, variable names, method names, parentheses, and so on. The rules that govern how these are formed (for example, a variable name cannot contain punctuation and cannot begin with a digit) are the lexical rules of the language.

What is lexical analysis?

The lexical analysis phase is the first phase of the compilation process. Its task is to read the source program from left to right, one character at a time; that is, to scan the stream of characters that makes up the source program and recognize each word (also known as a word symbol or token) according to the word-formation rules.

Lexical rules of the simple language to be analyzed:

1) Keywords

begin if then while do end

2) Operators and delimiters

:= + - * / < <= > >= <> = ; ( ) #

3) All other words are identifiers (ID) and integer constants (NUM), given by the following formal definitions:

ID = letter (letter | digit)*

NUM = digit digit*

4) Whitespace consists of blanks, tabs, and line breaks. It is used to separate IDs, NUMs, operators, delimiters, and keywords, and is usually discarded during the lexical analysis phase.

Category encoding for various word symbols

Word symbol

Category Code

Word symbol

Category Code

Begin

1

:

17

If

2

:=

18

Then

3

<

20

While

4

<>

21

do

5

<=

22

End

6

>

23

letter (letter | digit)*

10

>=

24

digit digit*

11

=

25

+

13

;

26

-

14

(

27

*

15

)

28

/

16

#

0

The function of the lexical analysis program:

Input: Source program string for the given grammar

Output: a sequence of two-tuples of the form (syn, token) or (syn, sum).

SYN is the word category code;

token is the string of the word itself;

sum is the integer constant.

For example, for the source program `begin x:=9; if x>0 then x:=2*x+1/3; end #`, lexical analysis outputs the following sequence: (1,begin) (10,x) (18,:=) (11,9) (26,;) (2,if) ...

Flow chart:


Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.