Parsing HTML Pages: Trying Out HTMLParser

Source: Internet
Author: User
Tags filter continue empty html page resource string trim stringbuffer
Page

While recently studying Lucene full-text search, I found that many places need to parse or analyze HTML content or HTML pages. Lucene's own demo program also provides an HTML parser, but it is not a pure Java solution. So I searched around and found "HTMLParser".

The URL is: http://htmlparser.sourceforge.net , the current version is 1.5.

I downloaded it and tried it out; it works well and fully meets the needs of parsing HTML for Lucene.

In a few days I will post the code for the Lucene full-text search (searching this website's articles, etc.).

The trial code is as follows, for your reference:

package com.jscud.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.jscud.util.LogMan; // a logging helper class

/**
* Demonstrates the application of HTML parse.
*
* @author Scud http://www.jscud.com
*/

public class Parsehtmltest
{

public static void Main (string[] args) throws Exception
{
String afile = "e:/jscud/temp/test.htm";

String content = ReadTextFile (afile, "GBK");

Test1 (content);
System.out.println ("====================================");

Test2 (content);
System.out.println ("====================================");

Test3 (content);
System.out.println ("====================================");

TEST4 (content);
System.out.println ("====================================");

TEST5 (Afile);
System.out.println ("====================================");

Relatively slow access to external resources
Test5 ("http://www.jscud.com");
System.out.println ("====================================");

}

/**
* Read the file to analyze the content.
* FilePath can also be a URL.
*
* @param resource file/url
*/
public static void Test5 (String resource) throws Exception
{
Parser myparser = new Parser (Resource);

Set encoding
Myparser.setencoding ("GBK");

HtmlPage visitor = new HtmlPage (myparser);

Myparser.visitallnodeswith (visitor);

String textinpage = Visitor.gettitle ();

System.out.println (Textinpage);
}

/**
* Processed in page style. This is recommended for a standard HTML page.
*/
public static void Test4 (String content) throws Exception
{
Parser Myparser;
Myparser = parser.createparser (content, "GBK");

HtmlPage visitor = new HtmlPage (myparser);

Myparser.visitallnodeswith (visitor);

String textinpage = Visitor.gettitle ();

System.out.println (Textinpage);
}

/**
* Use visitor mode to parse HTML pages.
*
* Small Advantages: translation of <> symbols
* Disadvantage: A lot of space, can not extract link
*
*/
public static void Test3 (String content) throws Exception
{
Parser Myparser;
Myparser = parser.createparser (content, "GBK");

Textextractingvisitor visitor = new Textextractingvisitor ();

Myparser.visitallnodeswith (visitor);

String textinpage = Visitor.getextractedtext ();

System.out.println (Textinpage);
}

/**
* Get regular text and links to content.
*
* The filter condition is used.
*/
public static void Test2 (String content) throws Parserexception
{
Parser Myparser;
NodeList nodelist = null;

Myparser = parser.createparser (content, "GBK");

Nodefilter textfilter = new Nodeclassfilter (textnode.class);
Nodefilter linkfilter = new Nodeclassfilter (linktag.class);

The meta is not processed for the time being
Nodefilter MetaFilter = new Nodeclassfilter (metatag.class);

Orfilter lastfilter = new Orfilter ();
Lastfilter.setpredicates (new nodefilter[] {textfilter, linkfilter});

NodeList = Myparser.parse (Lastfilter);

node[] nodes = Nodelist.tonodearray ();

for (int i = 0; i < nodes.length; i++)
{
Node anode = (node) nodes[i];

String line = "";
if (anode instanceof Textnode)
{
Textnode Textnode = (textnode) anode;
line = Textnode.toplaintextstring (). Trim ();
line = Textnode.gettext ();
}
else if (anode instanceof Linktag)
{
Linktag Linknode = (linktag) anode;

line = Linknode.getlink ();
//@todo Filter JSP Tags: You can implement this function yourself
line = Stringfunc.replace (line, "<%.*%>", "");
}

if (Istrimempty (line))
Continue

System.out.println (line);
}
}

/**
* Parse normal text nodes.
*
* @param content
* @throws parserexception
*/
public static void Test1 (String content) throws Parserexception
{
Parser Myparser;
node[] nodes = null;

Myparser = parser.createparser (content, NULL);

nodes = Myparser.extractallnodesthatare (Textnode.class); Exception could be thrown here

for (int i = 0; i < nodes.length; i++)
{
Textnode Textnode = (textnode) nodes[i];
String line = textnode.toplaintextstring (). Trim ();
if (Line.equals (""))
Continue
System.out.println (line);
}

}

   /**
     * Reads a file into the string.
     *
     * @param sfilename  file name
     * @ param sencode   String
     * @return File contents
     */
     public static string ReadTextFile (String sfilename, String sencode)
    {
         StringBuffer sbstr = new StringBuffer ();

        try
        {
             file FF = new file (sfilename);
            InputStreamReader Read = new InputStreamReader (new FileInputStream (FF),
                     Sencode);
            BufferedReader ins = new BufferedReader (read );

            String dataline = "";
            while (null!= (Dataline = Ins.readline ()))
            {
                 sbstr.append (dataline);
                sbstr.append ("\ r \ n ");
           }

Ins.close ();
}
catch (Exception e)
{
Logman.error ("Read Text File error", E);
}

return sbstr.tostring ();
}

   /**
     * To remove the left and right space after the string is empty
     * @param astr String
     * @return Boolean
     */
    Public Static Boolean Istrimempty (String astr)
    {
        if ( NULL = = Astr) | | (astr.length () = 0))
        {
             return true;
       }
        if (IsBlank (Astr.trim ())
         {
            return true;
       }
        return false;
   }

/**
* Whether the string is empty: null or length 0.
* @param astr source string.
* @return Boolean
*/
public static Boolean IsBlank (String astr)
{
if (null = ASTR) | | (astr.length () = 0))
{
return true;
}
Else
{
return false;
}
}

}



Contact Us

The content source of this page is from the Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on this page don't have any relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.