Recently, in a mobile app project, we crawled news with a news crawler and then displayed it in the app. We then discovered that the phone side does not support the style tag: if a page contains a style tag, the contents of the tag are shown as plain text, which badly affects the appearance of the page. So I wrote a utility class that uses NekoHTML to strip the style tag.
The Html.filter.properties configuration file lists the attributes that accepted tags may keep, the tags to accept, and the tags to remove:
Attributes=style,id,name,class,width,height,src,oldsrc,complete,align,alt,title
Accepttags=div,span,a,li,ul,nav,br,p,img,font,b,strong,table,tr,td
Removetags=style
PropertiesUtils: reads the values from the properties file
Package Com.tiamaes.gjds.util;import Java.io.ioexception;import Java.util.properties;import org.springframework.core.io.classpathresource;/** * <p> Description: Read attributes in Properties </p> * <p> Created by: Wang Cheng </p> * <p> created: January 28, 2015 morning 11:23:27 </p> * <p> Copyright Note: ©2015 tiamaes </p> * /public class Propertiesutils {private Properties properties;public propertiesutils (String path) {try { Classpathresource resource = new Classpathresource (path);p roperties = new properties ();p roperties.load ( Resource.getinputstream ());} catch (IOException e) {e.printstacktrace ();}} public string get (string key) {return This.properties.getProperty (key);}}
HtmlFilterUtils: filters the tags in the HTML
Package Com.tiamaes.gjds.util;import Java.io.chararrayreader;import Java.io.stringwriter;import Java.util.arraylist;import Java.util.list;import Org.apache.xerces.xni.parser.xmldocumentfilter;import Org.cyberneko.html.filters.elementremover;import Org.cyberneko.html.filters.writer;import Org.cyberneko.html.parsers.domparser;import org.xml.sax.inputsource;/** * <p> class Description: Filter tags in html </p> * <p > Creator: Wang Cheng </p> * <p> creation time: January 29, 2015 10:45:02 </p> * <p> Copyright Note: ©2015 tiamaes </p> */pub Lic class Htmlfilterutils {private static propertiesutils properties = Null;private static Htmlfilterutils filter = Null;p Rivate string configpath = "Html.filter.properties";p rivate static final String Attribute_field = "Attributes";p rivate St Atic final String Accept_tags_field = "Accepttags";p rivate static final String Remove_tags_field = "Removetags";p rivate Li st<string> attributes = new arraylist<string> ();p rivate list<string> accepttags = new ARraylist<string> ();p rivate list<string> removetags = new arraylist<string> ();p rivate static synchronized void Syncinit () {if (filter = = NULL) filter = new Htmlfilterutils ();} public static Htmlfilterutils getinstance () {return getinstance (false);} public static Htmlfilterutils getinstance (Boolean createNew) {if (CreateNew) return new Htmlfilterutils (); if (filter = = NULL) {Syncinit ();} return filter;} Private Htmlfilterutils () {if (properties = = null) {properties = new propertiesutils (Configpath);} This.addtolist (attributes, Properties.get (Attribute_field)); This.addtolist (Accepttags, Properties.get (ACCEPT_ Tags_field)); This.addtolist (Removetags, Properties.get (Remove_tags_field));} public void Addatributes (String attrname) {this.attributes.add (attrname);} public void Removeatributes (String attrname) {this.attributes.remove (attrname);} public void Addrmovetag (String tagName) {this.removeTags.add (tagName);} public void Removermovetag (String tagName) 
{this.removeTags.remove (tagName);} public void ADdaccepttag (String tagName) {this.acceptTags.add (tagName);} public void Removeaccepttag (String tagName) {this.acceptTags.remove (tagName);} private void Addtolist (list<string> list,string sources) {if (list = = null) List = new arraylist<string> (); string[] Sourcesarray = Sources.split (","); for (String Str:sourcesarray) {list.add (str);}} public string DoFilter (string htmlcode) {Elementremover remover = new Elementremover (); string[] Atrrs = new String[attributes.size ()];for (String tag:accepttags) remover.acceptelement (tag, Attributes.toarray (Atrrs)); for (String tag:removetags) remover.removeelement (tag); CharArrayReader reader = null; String result;try {StringWriter filtereddescription = new StringWriter (); Writer writer = new writer (filtereddescription, "UTF-8"); Xmldocumentfilter[] filters = {Remover,writer};D omparser parser = new Domparser (); reader = new CharArrayReader ( Htmlcode.tochararray ()); InputSource InputSource = new InputSource (reader);p arser.setproperty ("http://cyberneko.org/html/pRoperties/filters ", filters);p arser.parse (inputsource); result = Filtereddescription.tostring ();} catch (Exception E1) {e1.printstacktrace (); result = Htmlcode;} try {reader.close ();} catch (Exception e) {e.printstacktrace ();} return result;}}
Call doFilter to filter the contents of the HTML.
Using NekoHTML to process web pages (removing style tags)