package com.jscud.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.jscud.util.LogMan; // a log record class

/**
 * Demonstrates several ways of using the HTML Parser library to pull text,
 * links and the page title out of an HTML document.
 *
 * @author Scud http://www.jscud.com
 */
public class ParseHtmlTest
{
    /** Character encoding used for the sample file and for most parsers below. */
    private static final String ENCODING = "GBK";

    /** Line printed between the outputs of the individual demos. */
    private static final String SEPARATOR = "======================================";

    /**
     * Runs every demo in turn against a local sample file, then runs the
     * title-extraction demo once more against a remote URL.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if any of the demos fails
     */
    public static void main(String[] args) throws Exception
    {
        String aFile = "E:/jscud/temp/test.htm";

        String content = readTextFile(aFile, ENCODING);

        test1(content);
        System.out.println(SEPARATOR);

        test2(content);
        System.out.println(SEPARATOR);

        test3(content);
        System.out.println(SEPARATOR);

        test4(content);
        System.out.println(SEPARATOR);

        test5(aFile);
        System.out.println(SEPARATOR);

        // Accesses an external resource, so this call is relatively slow.
        test5("http://www.jscud.com");
        System.out.println(SEPARATOR);
    }

    /**
     * Parses a resource that the parser reads by itself and prints the page title.
     * The resource may be a file path or a URL.
     *
     * @param resource file path or URL
     * @throws Exception if the resource cannot be opened or parsed
     */
    public static void test5(String resource) throws Exception
    {
        Parser myParser = new Parser(resource);

        // Set the encoding explicitly.
        myParser.setEncoding(ENCODING);

        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getTitle();
        System.out.println(textInPage);
    }

    /**
     * Processes the content as a whole page and prints the page title.
     * This approach is recommended for a standard HTML page.
     *
     * @param content HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void test4(String content) throws Exception
    {
        Parser myParser = Parser.createParser(content, ENCODING);

        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getTitle();
        System.out.println(textInPage);
    }

    /**
     * Parses the HTML content with a text-extracting visitor and prints the
     * extracted text.
     *
     * Minor advantage: symbols such as &lt; and &gt; are translated.
     * Disadvantages: the output contains a lot of spaces, and links cannot
     * be extracted.
     *
     * @param content HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void test3(String content) throws Exception
    {
        Parser myParser = Parser.createParser(content, ENCODING);

        TextExtractingVisitor visitor = new TextExtractingVisitor();
        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getExtractedText();
        System.out.println(textInPage);
    }

    /**
     * Prints the plain text and the link targets found in the content.
     * Nodes are selected with a filter that accepts text nodes or link tags;
     * lines that are empty after trimming are skipped.
     *
     * @param content HTML text to parse
     * @throws ParserException if parsing fails
     */
    public static void test2(String content) throws ParserException
    {
        Parser myParser = Parser.createParser(content, ENCODING);

        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

        // Meta tags are not processed for now.
        // NodeFilter metaFilter = new NodeClassFilter(MetaTag.class);

        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });

        NodeList nodeList = myParser.parse(lastFilter);
        Node[] nodes = nodeList.toNodeArray();

        for (int i = 0; i < nodes.length; i++)
        {
            Node aNode = nodes[i];
            String line = "";

            if (aNode instanceof TextNode)
            {
                TextNode textNode = (TextNode) aNode;
                // line = textNode.toPlainTextString().trim();
                line = textNode.getText();
            }
            else if (aNode instanceof LinkTag)
            {
                LinkTag linkNode = (LinkTag) aNode;
                line = linkNode.getLink();
                // TODO: stripping of embedded script tags can be implemented here.
                // line = StringFunc.replace(line, "<%.*%>", "");
            }

            if (isTrimEmpty(line))
            {
                continue;
            }

            System.out.println(line);
        }
    }

    /**
     * Prints the trimmed plain text of every text node in the content,
     * skipping nodes whose text is empty after trimming.
     *
     * @param content HTML text to parse
     * @throws ParserException if parsing fails
     */
    public static void test1(String content) throws ParserException
    {
        // No charset is passed here; the parser is created with a null charset.
        Parser myParser = Parser.createParser(content, null);

        // An exception could be thrown here.
        Node[] nodes = myParser.extractAllNodesThatAre(TextNode.class);

        for (int i = 0; i < nodes.length; i++)
        {
            TextNode textNode = (TextNode) nodes[i];
            String line = textNode.toPlainTextString().trim();

            if (line.equals(""))
            {
                continue;
            }

            System.out.println(line);
        }
    }

    /**
     * Reads a text file into a string, terminating every line with CR LF.
     * Any failure is logged and whatever was read up to that point is
     * returned (an empty string if nothing could be read).
     *
     * @param sFileName file name
     * @param sEncode name of the character encoding used to decode the file
     * @return file content
     */
    public static String readTextFile(String sFileName, String sEncode)
    {
        StringBuffer sbStr = new StringBuffer();
        FileInputStream fileIn = null;

        try
        {
            // Open the raw stream first so it can be closed even when the
            // reader construction fails (for example on an unknown encoding).
            fileIn = new FileInputStream(new File(sFileName));
            BufferedReader ins = new BufferedReader(new InputStreamReader(fileIn, sEncode));

            String dataLine;
            while (null != (dataLine = ins.readLine()))
            {
                sbStr.append(dataLine);
                sbStr.append("\r\n");
            }
        }
        catch (Exception e)
        {
            LogMan.error("Read text file error", e);
        }
        finally
        {
            // Closing the underlying stream releases the file handle on
            // every path, including the failure paths above.
            if (fileIn != null)
            {
                try
                {
                    fileIn.close();
                }
                catch (IOException e)
                {
                    LogMan.error("Read text file error", e);
                }
            }
        }

        return sbStr.toString();
    }

    /**
     * Tells whether the string is empty once leading and trailing whitespace
     * has been removed.
     *
     * @param astr string to check, may be null
     * @return true if the string is null, empty, or whitespace only
     */
    public static boolean isTrimEmpty(String astr)
    {
        return (null == astr) || isBlank(astr.trim());
    }

    /**
     * Tells whether the string is blank, meaning null or of length 0.
     *
     * @param astr source string, may be null
     * @return true if the string is null or has length 0
     */
    public static boolean isBlank(String astr)
    {
        return (null == astr) || (astr.length() == 0);
    }
}