Parse HTML page: Trial of HTML Parser

Source: Internet
Author: User

Author: Scud (Xiaoxia Flying Cloud), http://www.jscud.com. If you reprint this article, please credit the author and the source; otherwise, please do not reprint it. Thank you.

Recently, we have been studying full-text retrieval with Lucene. In many cases, we need to parse or analyze HTML content or HTML pages. The Lucene demo also provides an HTML parser, but it is not a pure Java solution, so I searched around and found "htmlparser" on the Internet.

URL: http://htmlparser.sourceforge.net, current version 1.5.

Download it and try it out. It feels good and can fully meet Lucene's HTML parsing needs.

In a few days, we will post the Lucene code for full-text search (for example, to search the articles on this site).

The trial code is as follows for your reference:

package com.jscud.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.jscud.util.LogMan; // a log record class

/**
* Demonstrates the application of HTML parse.
*
* @ Author Scud http://www.jscud.com
*/

Public class parsehtmltest
{

Public static void main (string [] ARGs) throws exception
{
String afile = "E:/jscud/temp/test.htm ";

String content = readtextfile (afile, "GBK ");

Test1 (content );
System. out. println ("======================================" );

Test2 (content );
System. out. println ("======================================" );

Test3 (content );
System. out. println ("======================================" );

Test4 (content );
System. out. println ("======================================" );

Test5 (afile );
System. out. println ("======================================" );

// Access external resources, relatively slow
Test5 ("http://www.jscud.com ");
System. out. println ("======================================" );

}

/**
* Analyze the content by reading the file.
* Filepath can also be a URL.
*
* @ Param resource file/URL
*/
Public static void test5 (string resource) throws exception
{
Parser myparser = new Parser (Resource );

// Set the Encoding
Myparser. setencoding ("GBK ");

Htmlpage visitor = new htmlpage (myparser );

Myparser. visitallnodeswith (visitor );

String textinpage = visitor. gettitle ();

System. Out. println (textinpage );
}

/**
* Processing by page. This method is recommended for a standard HTML page.
*/
Public static void test4 (string content) throws exception
{
Parser myparser;
Myparser = parser. createparser (content, "GBK ");

Htmlpage visitor = new htmlpage (myparser );

Myparser. visitallnodeswith (visitor );

String textinpage = visitor. gettitle ();

System. Out. println (textinpage );
}

/**
* Parse HTML pages in visitor mode.
*
* Minor advantages: translated <> and other symbols
* Disadvantage: A lot of spaces, cannot extract Link
*
*/
Public static void test3 (string content) throws exception
{
Parser myparser;
Myparser = parser. createparser (content, "GBK ");

Textextractingvisitor visitor = new textractingvisitor ();

Myparser. visitallnodeswith (visitor );

String textinpage = visitor. getextractedtext ();

System. Out. println (textinpage );
}

/**
* Get the plain text and link content.
*
* Filter conditions are used.
*/
Public static void Test2 (string content) throws parserexception
{
Parser myparser;
Nodelist = NULL;

Myparser = parser. createparser (content, "GBK ");

Nodefilter textfilter = new nodeclassfilter (textnode. Class );
Nodefilter linkfilter = new nodeclassfilter (linktag. Class );

// Do not process meta temporarily
// Nodefilter metafilter = new nodeclassfilter (metatag. Class );

Orfilter lastfilter = new orfilter ();
Lastfilter. setpredicates (New nodefilter [] {textfilter, linkfilter });

Nodelist = myparser. parse (lastfilter );

Node [] nodes = nodelist. tonodearray ();

For (INT I = 0; I <nodes. length; I ++)
{
Node anode = (node) nodes [I];

String line = "";
If (anode instanceof textnode)
{
Textnode = (textnode) Anode;
// Line = textnode. toplaintextstring (). Trim ();
Line = textnode. gettext ();
}
Else if (anode instanceof linktag)
{
Linktag linknode = (linktag) Anode;

Line = linknode. getlink ();
// @ Todo: You can implement this function by yourself.
// Line = stringfunc. Replace (line, "<%. * %> ","");
}

If (istrimempty (line ))
Continue;

System. Out. println (line );
}
}

/**
* Parse common text nodes.
*
* @ Param content
* @ Throws parserexception
*/
Public static void test1 (string content) throws parserexception
{
Parser myparser;
Node [] nodes = NULL;

Myparser = parser. createparser (content, null );

Nodes = myparser. extractallnodesthatare (textnode. Class); // exception cocould be thrown here

For (INT I = 0; I <nodes. length; I ++)
{
Textnode = (textnode) nodes [I];
String line = textnode. toplaintextstring (). Trim ();
If (line. Equals (""))
Continue;
System. Out. println (line );
}

}

/**
* Read an object to a string.
*
* @ Param sfilename file name
* @ Param sencode string
* @ Return File Content
*/
Public static string readtextfile (string sfilename, string sencode)
{
Stringbuffer sbstr = new stringbuffer ();

Try
{
File FF = new file (sfilename );
Inputstreamreader READ = new inputstreamreader (New fileinputstream (ff ),
Sencode );
Bufferedreader ins = new bufferedreader (read );

String dataline = "";
While (null! = (Dataline = ins. Readline ()))
{
Sbstr. append (dataline );
Sbstr. append ("/R/N ");
}

INS. Close ();
}
Catch (exception E)
{
Logman. Error ("Read text file error", e );
}

Return sbstr. tostring ();
}

/**
* Whether the string is empty after the Left and Right spaces are removed
* @ Param astr string
* @ Return Boolean
*/
Public static Boolean istrimempty (string astr)
{
If (null = astr) | (astr. Length () = 0 ))
{
Return true;
}
If (isblank (astr. Trim ()))
{
Return true;
}
Return false;
}

/**
* Whether the string is null: null or the length is 0.
* @ Param astr source string.
* @ Return Boolean
*/
Public static Boolean isblank (string astr)
{
If (null = astr) | (astr. Length () = 0 ))
{
Return true;
}
Else
{
Return false;
}
}

}

 

 

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.