Import Org.htmlparser.nodefilter;import Org.htmlparser.parser;import Org.htmlparser.beans.stringbean;import Org.htmlparser.filters.cssselectornodefilter;import Org.htmlparser.util.nodelist;public class HtmlUtil {public static string GetText (string html, string id) {try {Parser Parser = new Parser (HTML); Nodefilter filter = new Cssselectornodefilter ("#" + ID); NodeList nList = Parser.extractallnodesthatmatch (filter); return nList = = NULL | | Nlist.size () = = 0? Null:nList.elementAt (0). toplaintextstring ();} catch (Exception e) {e.printstacktrace (); return null;}} public static string Gettextbyclass (string html, string css_class) {try {Parser Parser = new Parser (HTML); Nodefilter filter = new Cssselectornodefilter ("." + Css_class); NodeList nList = Parser.extractallnodesthatmatch (filter); return nList = = NULL | | Nlist.size () = = 0? Null:nList.elementAt (0). toplaintextstring ();} catch (Exception e) {e.printstacktrace (); return null;}} public static string FilterText (string text) {if (text = = null)return null;text = Text.replace (">", ">"), Text = Text.replace ("<", "<"), Text = Text.replace ("" "," \ ""); text = Text.replace (" ", ""); text = Text.replace ("&", "&"); text = Text.replace ("©", "©"); text = Text.rep Lace ("", ""); return text;} /** * Get plain text information in Web pages * * @param HTML * @param ID * @return * @throws Exception * @throws Exception */public static String get Text (String html) throws Exception {Stringbean bean = new Stringbean (); Bean.setlinks (false); Bean.setreplacenonbreakingspaces (True); Bean.setcollapse (true);//Returns the parsed page plain text information parser parser = Parser.createparser ( HTML, "Utf-8");p Arser.visitallnodeswith (Bean);p arser.reset (); return bean.getstrings ();}}
This requires the HTMLParser library (htmlparser.jar) on the classpath. Call it as follows:
HtmlUtil.getText(htmlStr);
Extracting text information from HTML, Java version (for Lucene indexing)