Parsing an HTML tag in Java: a small hand-written tag parser
import java.util.HashMap;
import java.util.Map;

/**
 * A small state-machine parser that extracts the attributes of a single
 * {@code <script ...>} start tag.
 *
 * <p>Accepted input: optional leading spaces/tabs, the literal {@code <script}
 * (lowercase letters in the literal also match their uppercase form), exactly
 * one mandatory space or tab, then zero or more {@code name = 'value'} or
 * {@code name = "value"} pairs, terminated by {@code >} or {@code />}.
 * Attribute names consist of ASCII letters only; values must be quoted.
 *
 * <p>Usage: construct with the tag text, call {@link #parser()}, then read
 * attributes with {@link #getAttr(String)}. Instances hold mutable parse state
 * and are meant to parse one tag each.
 */
public class TagParser {
    public static final String START_SCRIPT = "<script";
    public static final String END_SCRIPT = ">";
    public static final String END_SCRIPT_1 = "/>";
    public static final String EQ = "=";
    public static final char SPACE = ' ';
    public static final String MUST_SPACE = " ";
    public static final String TAB = "\t";
    public static final String SYMBOL = "'";

    /** Difference between an ASCII lowercase letter and its uppercase form. */
    private static final int CASE_OFFSET = 'a' - 'A';

    /** Ordinal of the attribute currently being parsed; key into {@link #map}. */
    private int index = 0;
    /** The characters of the tag being parsed. */
    private char[] tagChar;
    /** Cursor into {@link #tagChar}: index of the next unread character. */
    private int position = 0;
    /** Quote character that opened the value currently being read. */
    private char symbol = '"';
    /** Current state; {@code null} until {@link #parser()} is first called. */
    private DFAStatus status;
    /** Attributes in the order they were encountered. */
    private Map<Integer, Entity> map = new HashMap<Integer, Entity>();
    /** Attribute name to value, filled once the end of the tag is reached. */
    private Map<String, String> result = new HashMap<String, String>();

    /**
     * Creates a parser for the given tag text.
     *
     * @param str the text of the tag to parse
     */
    public TagParser(String str) {
        this.tagChar = str.toCharArray();
    }

    /** Demo: parses a sample tag ten times and prints the result and elapsed time. */
    public static void main(String[] args) throws SymbolError {
        String tag = "<script filter=\"a'\" type=\"text/javascript\" id='node' "
                + "src=\"http://www.test.com/abc.js\" async=\"true\"/>";
        long start = System.currentTimeMillis();
        for (int i = 0; i < 10; i++) {
            TagParser token = new TagParser(tag);
            token.parser();
            System.out.println(token.getAttr("src"));
            System.out.println(token);
        }
        System.out.println("use time:" + (System.currentTimeMillis() - start));
    }

    /**
     * Runs the state machine until the end of the tag is reached.
     *
     * <p>Calling this again after a successful parse is a no-op.
     *
     * @throws SymbolError if the input does not match the accepted syntax,
     *         including input that ends before the closing {@code >} or
     *         inside a quoted value
     */
    public void parser() throws SymbolError {
        if (status == null) {
            status = DFAStatus.UNSTART;
            skipSpace();
        }
        // Iterative rather than recursive so the stack depth does not grow
        // with the number of attributes.
        while (status != DFAStatus.DONE) {
            switch (status) {
                case UNSTART:
                    if (!startsWith(START_SCRIPT)) {
                        throw syntaxError();
                    }
                    position += START_SCRIPT.length();
                    status = DFAStatus.START;
                    break;
                case START:
                    nextSpace();
                    break;
                case NULL:
                    skipSpace();
                    if (startsWith(END_SCRIPT_1) || startsWith(END_SCRIPT)) {
                        status = DFAStatus.DONE;
                        done();
                    } else {
                        parserName();
                    }
                    break;
                case EQ:
                    parserVal();
                    break;
                case SYMBOL_END:
                    status = DFAStatus.NULL;
                    break;
                default:
                    // SYMBOL_START is only transient inside parserVal(); reaching
                    // it here means the machine is in an inconsistent state.
                    throw syntaxError();
            }
        }
    }

    /**
     * Returns the value of a parsed attribute.
     *
     * @param name attribute name, matched exactly as it appeared in the tag
     * @return the attribute value, or {@code null} if no such attribute was
     *         parsed (or {@link #parser()} has not completed)
     */
    public String getAttr(String name) {
        return result.get(name);
    }

    @Override
    public String toString() {
        return result.toString();
    }

    /**
     * Tests whether the input at the cursor begins with {@code str}. A
     * lowercase ASCII letter in {@code str} also matches its uppercase form in
     * the input; all other characters must match exactly.
     */
    private boolean startsWith(String str) {
        char[] expected = str.toCharArray();
        if (position + expected.length > tagChar.length) {
            return false;
        }
        for (int i = 0; i < expected.length; i++) {
            char actual = tagChar[position + i];
            if (actual == expected[i]) {
                continue;
            }
            boolean upperCaseMatch = isLower(expected[i]) && actual == expected[i] - CASE_OFFSET;
            if (!upperCaseMatch) {
                return false;
            }
        }
        return true;
    }

    /** Copies the collected attributes into the name-to-value result map. */
    private void done() {
        for (Entity entity : map.values()) {
            result.put(entity.name, entity.value);
        }
    }

    /**
     * Reads a quoted attribute value at the cursor (after optional blanks) and
     * stores it on the attribute registered by {@link #parserName()}.
     *
     * @throws SymbolError if the value does not start with a quote or the
     *         closing quote is missing
     */
    private void parserVal() throws SymbolError {
        skipSpace();
        if (position >= tagChar.length
                || (tagChar[position] != '\'' && tagChar[position] != '"')) {
            throw syntaxError();
        }
        symbol = tagChar[position];
        status = DFAStatus.SYMBOL_START;
        position++;

        int valueStart = position;
        while (position < tagChar.length && tagChar[position] != symbol) {
            position++;
        }
        if (position >= tagChar.length) {
            // Input ended before the closing quote.
            throw syntaxError();
        }
        String value = new String(tagChar, valueStart, position - valueStart);
        position++; // consume the closing quote
        status = DFAStatus.SYMBOL_END;

        map.get(index).value = value;
        index++;
    }

    private boolean isUpper(char c) {
        return c >= 'A' && c <= 'Z';
    }

    private boolean isLower(char c) {
        return c >= 'a' && c <= 'z';
    }

    /**
     * Reads an attribute name (ASCII letters) followed by optional blanks and
     * a mandatory {@code =}, then registers a new attribute under the current
     * index.
     *
     * @throws SymbolError if no letter is found at the cursor or the name is
     *         not followed by {@code =}
     */
    private void parserName() throws SymbolError {
        int nameStart = position;
        while (position < tagChar.length
                && (isLower(tagChar[position]) || isUpper(tagChar[position]))) {
            position++;
        }
        if (position == nameStart) {
            throw syntaxError();
        }
        String name = new String(tagChar, nameStart, position - nameStart);
        skipSpace();
        nextEQ();
        map.put(index, new Entity(name));
    }

    /** Consumes a mandatory {@code =} at the cursor and moves to the EQ state. */
    private void nextEQ() throws SymbolError {
        if (!startsWith(EQ)) {
            throw syntaxError();
        }
        position += EQ.length();
        status = DFAStatus.EQ;
    }

    /** Advances the cursor past any run of spaces and tabs. */
    private void skipSpace() {
        while (position < tagChar.length
                && (tagChar[position] == SPACE || tagChar[position] == '\t')) {
            position++;
        }
    }

    /** Consumes the single mandatory space or tab that must follow the tag name. */
    private void nextSpace() throws SymbolError {
        if (startsWith(MUST_SPACE)) {
            position += MUST_SPACE.length();
        } else if (startsWith(TAB)) {
            position += TAB.length();
        } else {
            throw syntaxError();
        }
        status = DFAStatus.NULL;
    }

    /**
     * Builds the error for the current cursor position. Guards against the
     * cursor being at or past the end of the input, where indexing
     * {@code tagChar[position]} would itself throw.
     */
    private SymbolError syntaxError() {
        String found = position < tagChar.length
                ? String.valueOf(tagChar[position])
                : "<end of input>";
        return new SymbolError("syntax error:" + found);
    }

    /** Thrown when the input does not match the accepted tag syntax. */
    public static class SymbolError extends Exception {
        private static final long serialVersionUID = 2441411373778495898L;

        public SymbolError(String msg) {
            super(msg);
        }
    }

    /** One attribute: its name and (once parsed) its value. */
    public static class Entity {
        public String name;
        public String value;

        public Entity(String name) {
            this.name = name;
        }

        @Override
        public String toString() {
            return "[" + name + ":" + value + "]";
        }
    }

    /** States of the tag parser. */
    public enum DFAStatus {
        UNSTART, START, SYMBOL_START, SYMBOL_END, DONE, NULL, EQ
    }
}
How does Java parse HTML tags?
If writing your own parser is too much trouble, there is a free library (distributed as a JAR) dedicated to parsing HTML, called jsoup. With it you can look up any tag on a page in no more than about four lines of code.
Parsing HTML in Java
Take the page source and strip out the HTML markup. Here is something I wrote; it was not easy to write, and it may still need some modification (^_^):
Public String HtmlToTextGb2312 (String inputString)
{
String htmlStr = inputString; // String containing html tags
String textStr = "";
Pattern p_script;
Matcher m_script;
Pattern p_style;
Matcher m_style;
Pattern p_html;
Matcher m_html;
Pattern p_houhtml;
Matcher m_houhtml;
Pattern p_spe;
Matcher m_spe;
Pattern p_blank;
Matcher m_blank;
Pattern p_table;
Matcher m_table;
Pattern p_enter;
Matcher m_enter;
Try {
String regEx_script = "<[\ s] *? Script [^>] *?> [\ S \ S] *? <[\ S] *? \/[\ S] *? Script [\ s] *?> ";
// Define the regular expression of the script.
String regEx_style = "<[\ s] *? Style [^>] *?> [\ S \ S] *? <[\ S] *? \/[\ S] *? Style [\ s] *?> ";
// Define the regular expression of the style.
String regEx_html = "<[^>] +> ";
// Define the regular expression of the HTML Tag
String regEx_houhtml = "/[^>] +> ";
// Define the regular expression of the HTML Tag
String regEx_spe = "\ & [^;] +; & qu ...... the remaining full text>