Capturing a web page's title and body text in Java

Source: Internet
Author: User
Tags: getscript

Capturing a web page's title and body text in Java

 

 

 

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page and extracts pieces of it (title, links, scripts,
 * style sheets, article body) with regular expressions.
 *
 * <p>The extraction is regex based and therefore only approximate: it does not
 * parse HTML and will be confused by markup nested inside comments or
 * attribute values.
 */
public class WebContent {

    /** Tag names are case-insensitive in HTML and elements may contain line breaks. */
    private static final int TAG_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern TITLE = Pattern.compile("<title[^>]*>.*?</title>", TAG_FLAGS);

    /** The href value may be double-quoted, single-quoted or unquoted. */
    private static final Pattern LINK = Pattern.compile(
            "<a\\b[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>", TAG_FLAGS);

    private static final Pattern SCRIPT = Pattern.compile("<script.*?</script>", TAG_FLAGS);

    private static final Pattern STYLE = Pattern.compile("<style.*?</style>", TAG_FLAGS);

    private static final Pattern ANY_TAG = Pattern.compile("<.*?>", Pattern.DOTALL);

    private static final Pattern ORIGINAL_DIV =
            Pattern.compile("<div class=\"original\">(.*?)</p></div>", Pattern.DOTALL);

    private static final Pattern LINE_BREAK = Pattern.compile("(<br>)+?");

    private static final Pattern EM_PARAGRAPH = Pattern.compile("<p><em>.*?</em></p>");

    // NOTE(review): this literal appears machine-translated in the source this class
    // was recovered from; confirm it against the real page-title suffix.
    private static final String YAHOO_TITLE_SUFFIX = "_Yahoo knowledge Hall";

    /**
     * Reads the whole content of a web page, decoded as UTF-8.
     *
     * <p>Line terminators are dropped: the lines of the page are concatenated
     * directly, exactly as the page's lines are read one by one.
     *
     * @param htmlurl absolute URL of the page
     * @return the page content as a single line
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println("the URL format you entered is incorrect! Enter ");
            throw me;
        }

        final StringBuilder sb = new StringBuilder();
        // try-with-resources closes the stream even when readLine() fails.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return the text of every {@code <title>} element concatenated, with the
     *         tags removed; empty if there is none
     */
    public String getTitle(final String s) {
        final StringBuilder title = new StringBuilder();
        for (final String match : findAll(TITLE, s)) {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Extracts anchor elements.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return every {@code <a ... href=...>...</a>} element, tags included
     */
    public List<String> getLink(final String s) {
        return findAll(LINK, s);
    }

    /**
     * Extracts script elements.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return every {@code <script ...>...</script>} element, tags included
     */
    public List<String> getScript(final String s) {
        return findAll(SCRIPT, s);
    }

    /**
     * Extracts style elements.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return every {@code <style ...>...</style>} element, tags included
     */
    public List<String> getCSS(final String s) {
        return findAll(STYLE, s);
    }

    /**
     * Removes markup.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return {@code s} with every {@code <...>} tag deleted
     */
    public String outTag(final String s) {
        if (s == null) {
            return "";
        }
        return ANY_TAG.matcher(s).replaceAll("");
    }

    /**
     * Downloads a Yahoo knowledge hall article and extracts its title and body.
     *
     * <p>A download failure is reported on standard error and treated as an
     * empty page, so the returned map always has both keys.
     *
     * @param s URL of the article
     * @return a map with the keys {@code "title"} and {@code "original"}
     */
    public HashMap<String, String> getFromYahoo(final String s) {
        final HashMap<String, String> hm = new HashMap<>();

        System.out.println("\n---------------- start to read the webpage (" + s
                + ") ------------------");
        String html = "";
        try {
            html = getOneHtml(s);
        } catch (final IOException e) {
            // Best effort: keep going with an empty page, but say why.
            System.err.println("failed to read " + s + ": " + e);
        }
        System.out.println("------------------ reading a webpage (" + s
                + ") ends ------------------\n");
        System.out.println("------------------ analysis (" + s
                + ") Result: ------------------\n");

        final String title = getTitle(html).replace(YAHOO_TITLE_SUFFIX, "");

        final StringBuilder body = new StringBuilder();
        final Matcher ma = ORIGINAL_DIV.matcher(html);
        while (ma.find()) {
            body.append(ma.group());
        }
        // Turn <br> into real line feeds, then drop the <p><em>...</em></p> paragraphs.
        String text = LINE_BREAK.matcher(body).replaceAll("\n");
        text = EM_PARAGRAPH.matcher(text).replaceAll("");

        hm.put("title", title);
        hm.put("original", outTag(text));
        return hm;
    }

    /**
     * Reads URLs from standard input, one per line, until the line {@code go}
     * (or end of input), then prints the title and body of each page.
     *
     * <p>Sample input:
     * <pre>
     * http://ks.cn.yahoo.com/question/1307121201133.html
     * http://ks.cn.yahoo.com/question/1307121101907.html
     * http://ks.cn.yahoo.com/question/1307121101907_2.html
     * go
     * </pre>
     *
     * @param args unused
     */
    public static void main(final String args[]) {
        final List<String> urls = new ArrayList<>();
        System.out.print("input URL, one line, input end after go program start running:\n");

        final BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try {
            String line;
            // Stop on "go" or when standard input is exhausted.
            while ((line = br.readLine()) != null) {
                final String url = line.trim();
                if (url.equals("go")) {
                    break;
                }
                if (!url.isEmpty()) {
                    urls.add(url);
                }
            }
        } catch (final IOException e) {
            System.err.println("failed to read the URL list: " + e);
        }

        final WebContent wc = new WebContent();
        for (final String url : urls) {
            final HashMap<String, String> hm = wc.getFromYahoo(url);
            System.out.println("title:" + hm.get("title"));
            System.out.println("content:\n" + hm.get("original"));
        }
    }

    /** Collects every match of {@code pattern} in {@code s}; {@code null} input yields an empty list. */
    private static List<String> findAll(final Pattern pattern, final String s) {
        final List<String> matches = new ArrayList<>();
        if (s == null) {
            return matches;
        }
        final Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            matches.add(ma.group());
        }
        return matches;
    }
}

  


Java Web Page Information Capture

First, fetch the content of the page. Then analyze the title and date formats of several news articles to find what they have in common, and write a regular expression that filters out that content. I once did a project that captured all the content of a website this way and generated static files from it.
If different types of news use different formats, define one template per format and try each template in turn until one matches.

Capturing a page's title and encoding using Java

If you want the procedure, here it is.
If you only need to capture the page title and encoding, you do not need to load and parse the whole document. You can parse it incrementally instead:
1. Open the document stream. Given how HTML documents are laid out, reading 1 KB at a time is almost always enough to cover the head section.
2. Traverse the nodes to find the title and meta nodes, and retrieve their text.
3. Once both have been found, you are done.

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.