Capturing the title and body of a web page with Java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page and extracts pieces of it (title, links, scripts,
 * style sheets, article text) with regular expressions.
 *
 * <p>The extraction is regex based, not a real HTML parser: it works on the
 * raw markup and only recognises the simple tag shapes encoded in the
 * patterns below.
 */
public class WebContent {

    // Patterns are compiled once; HTML tag names are case-insensitive, and
    // DOTALL lets an element span several lines.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    // Site-specific pattern for the article body; kept exactly as in the original code.
    private static final Pattern ORIGINAL_DIV_PATTERN =
            Pattern.compile("<div class=\"original\">(.*?)</p></div>", Pattern.DOTALL);

    /**
     * Reads the whole content of a web page.
     *
     * <p>The page is decoded as UTF-8 and read line by line; line terminators
     * are dropped, so the result is the page collapsed onto a single line.
     *
     * @param htmlurl address of the page to download
     * @return the page content with line breaks removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            final URL url = new URL(htmlurl);
            in = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
            String temp;
            // Read all content of the web page.
            while ((temp = in.readLine()) != null) {
                sb.append(temp);
            }
        } catch (final MalformedURLException me) {
            System.out.println("the URL format you entered is incorrect! Enter");
            throw me;
        } catch (final IOException e) {
            e.printStackTrace();
            throw e;
        } finally {
            // Close on every path; the original only closed after a fully successful read.
            if (in != null) {
                try {
                    in.close();
                } catch (final IOException ignored) {
                    // Nothing useful can be done if closing fails; the read result stands.
                }
            }
        }
        return sb.toString();
    }

    /**
     * Obtains the web page title.
     *
     * @param s HTML markup to search
     * @return the text of every {@code <title>} element found, concatenated,
     *         with the tags removed; an empty string if there is none
     */
    public String getTitle(final String s) {
        final StringBuilder title = new StringBuilder();
        for (final String match : findAll(TITLE_PATTERN, s)) {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Gets the links.
     *
     * @param s HTML markup to search
     * @return every {@code <a ... href=...>...</a>} element, markup included,
     *         in document order
     */
    public List<String> getLink(final String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Gets the script code.
     *
     * @param s HTML markup to search
     * @return every {@code <script>} element, markup included, in document order
     */
    public List<String> getScript(final String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Gets the CSS.
     *
     * @param s HTML markup to search
     * @return every {@code <style>} element, markup included, in document order
     */
    public List<String> getCSS(final String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes markup.
     *
     * @param s text possibly containing tags
     * @return {@code s} with every {@code <...>} sequence that sits on one line deleted
     */
    public String outTag(final String s) {
        return s.replaceAll("<.*?>", "");
    }

    /**
     * Gets the title and content of a Yahoo knowledge Hall article.
     *
     * <p>Download failures are reported and otherwise ignored, so that one bad
     * address does not abort a batch; the returned values are then empty.
     *
     * @param s address of the article page
     * @return a map with the keys {@code "title"} and {@code "original"}
     */
    public HashMap<String, String> getFromYahoo(final String s) {
        final HashMap<String, String> hm = new HashMap<String, String>();
        final StringBuilder sb = new StringBuilder();
        String html = "";
        System.out.println("\n----------------start to read the webpage (" + s + ")------------------");
        try {
            html = getOneHtml(s);
        } catch (final Exception e) {
            // Best effort: report the failure instead of silently discarding it.
            System.err.println("could not read " + s + ": " + e);
        }
        System.out.println("------------------reading a webpage (" + s + ") ends------------------\n");
        System.out.println("------------------analysis (" + s + ") Result:------------------\n");

        String title = outTag(getTitle(html));
        // NOTE(review): this suffix appears to have been translated in this copy of
        // the code; confirm it against the suffix the site really puts in its titles.
        title = title.replaceAll("_Yahoo knowledge Hall", "");

        final Matcher ma = ORIGINAL_DIV_PATTERN.matcher(html);
        while (ma.find()) {
            sb.append(ma.group());
        }
        String temp = sb.toString();
        temp = temp.replaceAll("(<br>)+?", "\n"); // convert the line feed
        temp = temp.replaceAll("<p><em>.*?</em></p>", ""); // drop the <p><em>...</em></p> notes

        hm.put("title", title);
        hm.put("original", outTag(temp));
        return hm;
    }

    /**
     * Tests a set of web pages, for Yahoo knowledge hall. Addresses are read
     * from standard input, one per line, until a line containing {@code go}
     * (or end of input) is reached; each page is then fetched and printed.
     *
     * @param args unused
     */
    public static void main(final String args[]) {
        final List<String> list = new ArrayList<String>();
        System.out.print("input URL, one line, input end after go program start running:\n");
        /*
         * Sample input:
         * http://ks.cn.yahoo.com/question/1307121201133.html
         * http://ks.cn.yahoo.com/question/1307121101907.html
         * http://ks.cn.yahoo.com/question/1307121101907_2.html
         * http://ks.cn.yahoo.com/question/1307121101907_3.html
         * http://ks.cn.yahoo.com/question/1307121101907_4.html
         * http://ks.cn.yahoo.com/question/1307121101907_5.html
         * http://ks.cn.yahoo.com/question/1307121101907_6.html
         * http://ks.cn.yahoo.com/question/1307121101907_7.html
         * http://ks.cn.yahoo.com/question/1307121101907_8.html
         */
        final BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try {
            String url;
            // Stop at "go" or at end of input; readLine() returns null at end of stream.
            while ((url = br.readLine()) != null && !url.equals("go")) {
                list.add(url);
            }
        } catch (final IOException e) {
            System.err.println("failed to read URLs from standard input: " + e);
        }

        final WebContent wc = new WebContent();
        for (final String pageUrl : list) {
            final HashMap<String, String> hm = wc.getFromYahoo(pageUrl);
            System.out.println("title:" + hm.get("title"));
            System.out.println("content:\n" + hm.get("original"));
        }
    }

    /** Collects every match of {@code pattern} in {@code s}, in order of appearance. */
    private static List<String> findAll(final Pattern pattern, final String s) {
        final List<String> matches = new ArrayList<String>();
        final Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            matches.add(ma.group());
        }
        return matches;
    }
}
Java Web Page Information Capture
First, fetch the content of the page. Then analyze the title and date formats of several news articles to find what they have in common, and write a regular expression to filter the content. I have done a project that captured all the content of a website and generated static files from it.
If different types of news use different formats, you need to define multiple templates and try each one in turn until one matches.
Capturing a page's title and character encoding with Java
If you want the general procedure, here it is.
If you only need to capture the page title and encoding, you do not need to load and parse the whole document. You can parse it incrementally as it is read:
1. Get the document stream. Read it 1 KB at a time; given how HTML documents are laid out, the head information is almost always contained in the first chunk.
2. Traverse the nodes to find the title and meta nodes, and retrieve their text.
3. Once both have been obtained, you are finished.