package nekohtml;

import java.io.IOException;

import javax.xml.transform.TransformerException;

import org.apache.xpath.XPathAPI;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Small demo: parse an HTML page into a DOM document with NekoHTML, then
 * select nodes from that document with an XPath expression (Xalan's
 * {@code XPathAPI}).
 */
public class NekoHtmlAndXPath {

    // Alternative sample target; not used by main() but kept as a second example.
    private static final String BAIDU_URL = "http://www.baidu.com/";
    // NekoHTML reports element names in upper case by default, so the XPath
    // steps are written in upper case.
    private static final String BAIDU_XPATH = "//HTML//BODY//DIV[2]//P//MAP//AREA";

    // Sample target used by main().
    private static final String YYT_URL = "http://www.yinyuetai.com/mv/all?page=1";
    // NOTE(review): the original expression ended with a dangling "/", which is
    // not a valid XPath. The slash is dropped here; a final step (possibly the
    // element that actually carries the "title" attribute) may have been lost
    // in the source text -- confirm against the live page.
    private static final String YYT_XPATH =
            "/HTML/BODY/DIV[6]/DIV[2]/DIV[2]/DIV[3]/UL/LI/DIV[2]/H3";

    /**
     * Parses the HTML found at the given location into a DOM document.
     *
     * <p>Parse failures ({@link SAXException}, {@link IOException}) are printed
     * to standard error and not rethrown; the method then returns whatever the
     * parser holds at that point.
     *
     * @param url system identifier (URL) of the page to parse
     * @return the parser's document; presumably {@code null} or incomplete when
     *         parsing failed -- callers should check before use
     */
    public static Document getDocument(String url) {
        DOMParser parser = new DOMParser();
        try {
            parser.parse(url);
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return parser.getDocument();
    }

    /**
     * Selects the nodes matching an XPath expression, evaluated against the
     * given document.
     *
     * @param doc document to search
     * @param xp  XPath expression
     * @return the matching nodes, or {@code null} if evaluating the expression
     *         raised a {@link TransformerException} (the stack trace is printed)
     */
    public static NodeList getExactNode(Document doc, String xp) {
        NodeList list = null;
        try {
            list = XPathAPI.selectNodeList(doc, xp);
        } catch (TransformerException e) {
            e.printStackTrace();
        }
        return list;
    }

    /**
     * Parses the sample page, selects nodes with the sample XPath and prints
     * the match count followed by the "title" attribute of every match.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Parse the specified page into a DOM.
        Document doc = getDocument(YYT_URL);
        if (doc == null) {
            // Parsing failed and the stack trace was already printed.
            System.err.println("could not parse page: " + YYT_URL);
            return;
        }

        // Obtain the specified nodes based on the XPath.
        NodeList list = getExactNode(doc, YYT_XPATH);
        if (list == null) {
            // getExactNode returns null when the expression could not be evaluated.
            System.err.println("could not evaluate XPath: " + YYT_XPATH);
            return;
        }

        System.out.println("the number of nodes meeting the condition is:" + list.getLength());
        for (int i = 0; i < list.getLength(); i++) {
            // Prints "null" after the label when a node has no "title" attribute.
            System.out.println("the obtained node attribute is:"
                    + list.item(i).getAttributes().getNamedItem("title"));
        }
    }
}
JAR files used: xalan.jar, xercesImpl.jar, nekohtml.jar.
The XPath expressions for the target tags were obtained with Firebug, and they sometimes need a little adjustment. Because Firebug converts the HTML into a standard DOM, the XPath it reports may have a few small issues; for example, a tbody tag may have been added. In that case, just adjust the expression slightly.