Page
While recently studying Lucene full-text search, I found that many places need to parse or analyze HTML content or whole HTML pages. Lucene's own demo program also provides an HTML parser, but it is not a pure-Java solution. So I searched around and found "HTMLParser".
The URL is http://htmlparser.sourceforge.net, and the current version is 1.5.
I downloaded it and tried it out; it works well and fully meets the needs of parsing HTML for Lucene.
In a few days I will post the code for Lucene full-text search (searching the articles of this website, etc.).
The trial code is as follows, for your reference:
package com.jscud.test;

import com.jscud.util.LogMan; // a logging helper class

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;
/**
* Demonstrates the application of HTML parse.
*
* @author Scud http://www.jscud.com
*/
public class Parsehtmltest
{
public static void Main (string[] args) throws Exception
{
String afile = "e:/jscud/temp/test.htm";
String content = ReadTextFile (afile, "GBK");
Test1 (content);
System.out.println ("====================================");
Test2 (content);
System.out.println ("====================================");
Test3 (content);
System.out.println ("====================================");
TEST4 (content);
System.out.println ("====================================");
TEST5 (Afile);
System.out.println ("====================================");
Relatively slow access to external resources
Test5 ("http://www.jscud.com");
System.out.println ("====================================");
}
/**
* Read the file to analyze the content.
* FilePath can also be a URL.
*
* @param resource file/url
*/
public static void Test5 (String resource) throws Exception
{
Parser myparser = new Parser (Resource);
Set encoding
Myparser.setencoding ("GBK");
HtmlPage visitor = new HtmlPage (myparser);
Myparser.visitallnodeswith (visitor);
String textinpage = Visitor.gettitle ();
System.out.println (Textinpage);
}
/**
* Processed in page style. This is recommended for a standard HTML page.
*/
public static void Test4 (String content) throws Exception
{
Parser Myparser;
Myparser = parser.createparser (content, "GBK");
HtmlPage visitor = new HtmlPage (myparser);
Myparser.visitallnodeswith (visitor);
String textinpage = Visitor.gettitle ();
System.out.println (Textinpage);
}
/**
* Use visitor mode to parse HTML pages.
*
* Small Advantages: translation of <> symbols
* Disadvantage: A lot of space, can not extract link
*
*/
public static void Test3 (String content) throws Exception
{
Parser Myparser;
Myparser = parser.createparser (content, "GBK");
Textextractingvisitor visitor = new Textextractingvisitor ();
Myparser.visitallnodeswith (visitor);
String textinpage = Visitor.getextractedtext ();
System.out.println (Textinpage);
}
/**
* Get regular text and links to content.
*
* The filter condition is used.
*/
public static void Test2 (String content) throws Parserexception
{
Parser Myparser;
NodeList nodelist = null;
Myparser = parser.createparser (content, "GBK");
Nodefilter textfilter = new Nodeclassfilter (textnode.class);
Nodefilter linkfilter = new Nodeclassfilter (linktag.class);
The meta is not processed for the time being
Nodefilter MetaFilter = new Nodeclassfilter (metatag.class);
Orfilter lastfilter = new Orfilter ();
Lastfilter.setpredicates (new nodefilter[] {textfilter, linkfilter});
NodeList = Myparser.parse (Lastfilter);
node[] nodes = Nodelist.tonodearray ();
for (int i = 0; i < nodes.length; i++)
{
Node anode = (node) nodes[i];
String line = "";
if (anode instanceof Textnode)
{
Textnode Textnode = (textnode) anode;
line = Textnode.toplaintextstring (). Trim ();
line = Textnode.gettext ();
}
else if (anode instanceof Linktag)
{
Linktag Linknode = (linktag) anode;
line = Linknode.getlink ();
//@todo Filter JSP Tags: You can implement this function yourself
line = Stringfunc.replace (line, "<%.*%>", "");
}
if (Istrimempty (line))
Continue
System.out.println (line);
}
}
/**
* Parse normal text nodes.
*
* @param content
* @throws parserexception
*/
public static void Test1 (String content) throws Parserexception
{
Parser Myparser;
node[] nodes = null;
Myparser = parser.createparser (content, NULL);
nodes = Myparser.extractallnodesthatare (Textnode.class); Exception could be thrown here
for (int i = 0; i < nodes.length; i++)
{
Textnode Textnode = (textnode) nodes[i];
String line = textnode.toplaintextstring (). Trim ();
if (Line.equals (""))
Continue
System.out.println (line);
}
}
/**
* Reads a file into the string.
*
* @param sfilename file name
* @ param sencode String
* @return File contents
*/
public static string ReadTextFile (String sfilename, String sencode)
{
StringBuffer sbstr = new StringBuffer ();
try
{
file FF = new file (sfilename);
InputStreamReader Read = new InputStreamReader (new FileInputStream (FF),
Sencode);
BufferedReader ins = new BufferedReader (read );
String dataline = "";
while (null!= (Dataline = Ins.readline ()))
{
sbstr.append (dataline);
sbstr.append ("\ r \ n ");
}
Ins.close ();
}
catch (Exception e)
{
Logman.error ("Read Text File error", E);
}
return sbstr.tostring ();
}
/**
* To remove the left and right space after the string is empty
* @param astr String
* @return Boolean
*/
Public Static Boolean Istrimempty (String astr)
{
if ( NULL = = Astr) | | (astr.length () = 0))
{
return true;
}
if (IsBlank (Astr.trim ())
{
return true;
}
return false;
}
/**
* Whether the string is empty: null or length 0.
* @param astr source string.
* @return Boolean
*/
public static Boolean IsBlank (String astr)
{
if (null = ASTR) | | (astr.length () = 0))
{
return true;
}
Else
{
return false;
}
}
}