Capturing a web page's title and body in Java

Last Update:2014-07-10 Source: Internet

Author: User

Tags getscript

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more ＞

Capturing a web page's title and body in Java

Import java. io. bufferedReader; import java. io. IOException; import java. io. inputStreamReader; import java.net. malformedURLException; import java.net. URL; import java. util. arrayList; import java. util. hashMap; import java. util. list; import java. util. regex. matcher; import java. util. regex. pattern; public class WebContent {/*** read all content of a webpage */public String getOneHtml (final String htmlurl) throws IOException {URL Url; String temp; final StringBuffer sb = new StringBuffer (); try {url = new URL (htmlurl); final BufferedReader in = new BufferedReader (new InputStreamReader (url. openStream (), "UTF-8"); // read all content of the webpage while (temp = in. readLine ())! = Null) {sb. append (temp);} in. close ();} catch (final MalformedURLException me) {System. out. println ("the URL format you entered is incorrect! Enter "); me. getMessage (); throw me;} catch (final IOException e) {e. printStackTrace (); throw e;} return sb. toString ();}/***** @ param s * @ return obtain the webpage title */public String getTitle (final String s) {String regex; String title = ""; final List <String> list = new ArrayList <String> (); regex = "<title>. *? </Title> "; final Pattern pa = Pattern. compile (regex, Pattern. CANON_EQ); final Matcher ma = pa. matcher (s); while (ma. find () {list. add (ma. group () ;}for (int I = 0; I <list. size (); I ++) {title = title + list. get (I) ;}return outTag (title) ;}/ ***** @ param s * @ return get link */public List <String> getLink (final String s) {String regex; final List <String> list = new ArrayList <String> (); regex = "< A [^>] * href = (\ "([^ \"] *) \ "| \ '([^ \'] *) \ '| ([^ \ s>] *) [^>] *> (. *?) </A> "; final Pattern pa = Pattern. compile (regex, Pattern. DOTALL); final Matcher ma = pa. matcher (s); while (ma. find () {list. add (ma. 
group ();} return list;}/***** @ param s * @ return get script code */public List <String> getScript (final String s) {String regex; final List <String> list = new ArrayList <String> (); regex = "<script. *? </Script> "; final Pattern pa = Pattern. compile (regex, Pattern. DOTALL); final Matcher ma = pa. matcher (s); while (ma. find () {list. add (ma. group ();} return list;}/***** @ param s * @ return get CSS */public List <String> getCSS (final String s) {String regex; final List <String> list = new ArrayList <String> (); regex = "<style. *? </Style> "; final Pattern pa = Pattern. compile (regex, Pattern. DOTALL); final Matcher ma = pa. matcher (s); while (ma. find () {list. add (ma. group ();} return list;}/***** @ param s * @ return remove mark */public String outTag (final String s) {return s. replaceAll ("<. *?> "," ");}/***** @ Param s * @ return get Yahoo knowledge Hall article title and content */public HashMap <String, string> getFromYahoo (final String s) {final HashMap <String, String> hm = new HashMap <String, String> (); final StringBuffer sb = new StringBuffer (); string html = ""; System. out. println ("\ n ---------------- start to read the webpage (" + s + ") ------------------"); try {html = getOneHtml (s);} catch (final Exception e) {e. getMessage ();} // System. out. println (html); System. out. println ("------------------ reading a webpage (" + s + ") ends ------------------ \ n"); System. out. println ("------------------ analysis (" + s + ") Result: ------------------ \ n"); String title = outTag (getTitle (html); title = title. replaceAll ("_ Yahoo knowledge Hall", ""); // Pattern pa = Pattern. compile ("<div // class = \" original \ "> (. *?) (\ R \ n )*)(.*?) (\ R \ n )*)(.*?) </Div> ", Pattern. DOTALL); final Pattern pa = Pattern. compile (" <div class = \ "original \"> (.*?) </P> </div> ", Pattern. DOTALL); final Matcher ma = pa. matcher (html); while (ma. find () {sb. append (ma. group ();} String temp = sb. toString (); temp = temp. 
replaceAll ("(<br>) +? "," \ N "); // convert the line feed temp = temp. replaceAll (" <p> <em> .*? </Em> </p> "," "); // comment hm. put ("title", title); hm. put ("original", outTag (temp); return hm ;}/ ***** @ param args * tests a set of webpages, for Yahoo knowledge hall */public static void main (final String args []) {String url = ""; final List <String> list = new ArrayList <String> (); system. out. print ("input URL, one line, input end after go program start running: \ n");/** http://ks.cn.yahoo.com/question/1307121201133.html * http://ks.cn.yahoo.com/question/130712 1101907. html * http://ks.cn.yahoo.com/question/1307121101907_2.html * http://ks.cn.yahoo.com/question/1307121101907_3.html * http://ks.cn.yahoo.com/question/1307121101907_4.html * http://ks.cn.yahoo.com/question/1307121101907_5.html * http://ks.cn.yahoo.com/question/1307121101907_6.html * http://ks.cn.yahoo.com/question/1307121101907_7.html * http://ks.cn.yahoo.com/question/1307121101907_8.html */ Final BufferedReader br = new BufferedReader (new InputStreamReader (System. in); try {while (! (Url = br. readLine ()). equals ("go") {list. add (url) ;}} catch (final Exception e) {e. getMessage () ;}final WebContent wc = new WebContent (); HashMap <String, String> hm = new HashMap <String, String> (); for (int I = 0; I <list. size (); I ++) {hm = wc. getFromYahoo (list. get (I); System. out. println ("title:" + hm. get ("title"); System. out. println ("content: \ n" + hm. 
get ("original");}/** String htmlurl [] = {* "Http://ks.cn.yahoo.com/question/1307121201133.html", * "http://ks.cn.yahoo.com/question/1307121101907.html", * "http://ks.cn.yahoo.com/question/1307121101907_2.html", * "http://ks.cn.yahoo.com/question/1307121101907_3.html", * "http://ks.cn.yahoo.com/question/1307121101907_4.html", * "http://ks.cn.yahoo.com/question/1307121101907_5.html", * "http://ks.cn.yahoo.com/question/1307121101907_6.htm ", * "" L ", *" http://ks.cn.yahoo.com/question/1307121101907_7.html ", *" http://ks.cn.yahoo.com/question/1307121101907_8.html "}; WebContent * wc = new WebContent (); HashMap <String, String> hm = new HashMap <String, * String> (); for (int I = 0; I 
　　

Java Web Page Information Capture
First, fetch the content of the page. Then analyze the title and date formats of several news articles to find what they have in common, and write a regular expression to filter the content. I have done a project that captured all the content of a website and generated static files from it.
If different types of news use different formats, you need to define multiple templates and try each one in turn until one of them matches.

Capturing a page's title and encoding using Java
If you want the procedure, here it is.
If you only need to capture the page title and encoding, you do not need to load and parse the whole document. You can parse it incrementally instead:
1. Open the document stream. Because of the way HTML documents are laid out, reading 1 KB at a time means the first read almost always contains the head information.
2. Traverse the nodes to find the title and meta nodes, and extract their text.
3. Once you have those two values, you are done.

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More