Java web crawler crawl Baidu News

Last Update:2016-03-26 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

With Commons-httpclient

Commons-HttpClient is a legacy library that has since been deprecated by its maintainers (the Apache project); it is used here only because the original example was written against it.

Lucene version 4.3 is used.

Required JAR Packages

<img src="http://s2.51cto.com/wyfs02/M02/7D/FB/wKiom1b0E3uz0lXjAAAyFb6bkVY732.png" title="capture.PNG" alt="wKiom1b0E3uz0lXjAAAyFb6bkVY732.png" />

Package com.lulei.util;import java.io.bufferedreader;import java.io.bytearrayinputstream;import  java.io.File;import java.io.IOException;import java.io.InputStream;import  java.io.inputstreamreader;import java.util.hashmap;import java.util.iterator;import  java.util.map;import java.util.regex.matcher;import java.util.regex.pattern;import  org.apache.commons.httpclient.header;import org.apache.commons.httpclient.httpclient;import  org.apache.commons.httpclient.httpexception;import org.apache.commons.httpclient.httpmethod;import  Org.apache.commons.httpclient.httpstatus;import org.apache.commons.httpclient.methods.getmethod;import  org.apache.log4j.Logger;import org.apache.lucene.analysis.Analyzer;import  Org.apache.lucene.analysis.standard.standardanalyzer;import org.apache.lucene.document.document;import  org.apache.lucene.index.directoryreader;import org.apache.lucene.index.indexreader;import org.apache.lucene.index.indexwriter;import  org.apache.lucene.index.indexwriterconfig;import org.apache.lucene.index.indexwriterconfig.openmode; import org.apache.lucene.index.term;import org.apache.lucene.search.indexsearcher;import  Org.apache.lucene.search.nrtmanager;import org.apache.lucene.search.nrtmanager.trackingindexwriter;import  org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import  org.apache.lucene.search.searcherfactory;import org.apache.lucene.search.termquery;import  org.apache.lucene.search.topdocs;import org.apache.lucene.store.directory;import  org.apache.lucene.store.niofsdirectory;import org.apache.lucene.util.version;import org.junit.test; Public class mycrawl {    private static int maxconnecttimes  = 3;    private static httpclient httpclient = neW httpclient ();    private static logger log =  Logger.getlogger (mycrawl.class);     private static header[] responseheaders  = null;    private static String pageSourceCode =  "";     //  page Default encoding method     private static String  charsetname =  "iso-8859-1";    //  
regular matching needs to see the source of the Web page, firebug see not      //  crawler + Build index     public static void main (String[] args)  {        String urlSeed =  "http://news.baidu.com/ N?cmd=4&class=sportnews&pn=1&from=tab ";        hashmap< String, string> params = new hashmap<string, string> ();         params.Put ("Referer",  "http://www.baidu.com");         params.put (                  "User-agent",                  "mozilla/5.0  ( WINDOWS&NBSP;NT&NBSP;6.1;&NBSP;WOW64)  AppleWebKit/537.36  (Khtml, like gecko)  chrome/ 36.0.1985.125 safari/537.36 ");         getmethod getmethod  = new getmethod (urlseed);         iterator iter  = params.entryset (). Iterator ();        while  ( Iter.hasnext ())  {            Map.Entry  entry =  (Map.entry)  iter.next ();             string key =  (String)  entry.getkey ();             string val =  (String)  entry.getvalue ();             getmethod.setrequestheader (Key, val);         }        //  get page Source to pagesourcecode variable          try {             readpage (getmethod,  "Utf-8",  urlseed);        }  catch  (exception e)  {             E.printstacktrace ();        }         system.out.println (Pagesourcecode);        string  regexstr =  "& #8226; &lT;a href=\ "(. *?) 
\ "";         pattern pattern = pattern.compile (REGEXSTR,  Pattern.CASE_INSENSITIVE                 | pattern.dotall);         matcher matcher  = pattern.matcher (Pagesourcecode);         int count  = 0;        while  (Matcher.find ())  {             system.out.println (Matcher.group ());             system.out.println (Matcher.group (1));             system.out.println (Matcher.groupCount ());             count++;       &nBsp; }        system.out.println (count);     }     private static boolean readpage (httpmethod method, string  defaultCharset,            String  URLSTR)  throws HttpException, IOException {         int n = maxConnectTimes;        while  (n  > 0)  {            try {                 if  ( Httpclient.executemethod (method) &NBSP;!=&NBSP;HTTPSTATUS.SC_OK)  {                     log.error ("Can not  connect  " + urlstr +  "\ T"                              +  (maxconnecttimes - n  + 1)  +  "\ T"                              +  Httpclient.executemethod (method));                     n--;                 } else {                     //  Get header information                       Responseheaders = method.getresponseheadERS ();                     //  get page source code                      InputStream inputStream =  Method.getresponsebodyasstream ();                     BufferedReader bufferedReader = new  BufferedReader (                             new inputstreamreader ( Inputstream, charsetname));                     stringbuffer stringbuffer = new stringbuffer ();                     String lineString = null;                      while  ((Linestring = bufferedreader.readline ())  != null)  {                          stringbuffer.append (lineString);                         stringbuffer.append ( "\ n");                     }                     pagesourcecode = stringbuffer.tostring ();                     InputStream in = new  Bytearrayinputstream (                             pagesourcecode.getbytes ( CharsetName));                     
string charset = charsetutil.getstreamcharset (in,                              defaultcharset);                     //  The following judgment is for IP attribution query deliberately added                       if  ("Big5". Equals (charsET))  {                         charset =  "GBK";                     }                     if  (! Charsetname.tolowercase ()                              .equals ( Charset.tolowercase ()))  {                         pagesourcecode = new string (                                  pagesourcecode.getbytes (CharsetName),  charset);                     }                      return true;                 }            } catch  ( Exception e)  {                 e.printstacktrace ();                 system.out.println (urlstr +  " -- can ' t connect  "                           +  (maxconnecttimes - n + 1));                 n--;             }        }         return false;    }    //  Real-time search     @ Test    public void search ()  {         analyzer analyzer = new standardanalyzer (version.lucene_43);         indexwriterconfig indexwriterconfig = new indexwriterconfig (  &NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;VERSION.LUCENE_43,  analyzer);         indexwriterconfig.setopenmode (OpenMode.CREATE_ Or_append);        String indexFile =  "D:/index/knnik";         Directory directory = null;         try {            directory =  niofsdirectory.open (New file (indexfile));             //  Create an index              Indexwriter indexwriter = new indexwriter (directory,                     indexwriterconfig);             trackingindexwriter trackingindexwriter  = new trackingindexwriter (                 &nbsP;   indexwriter);             Nrtmanager nrtmanager = new nrtmanager (trackingindexwriter,                     new  Searcherfactory ());            //  Query index              IndexSearcher indexSearch =  Nrtmanager.acquire ();            /*              * //general method of 
acquiring Indexsearch, non-real-time   Indexreader             * indexreader= Directoryreader.open (directory);              *      &nbSp;       * indexsearcher indexsearch=new indexsearcher ( Indexreader);             */             term term = new term ("Content" ,  "we");             query query =  new termquery (term);             topdocs  topdocs = indexsearch.search (query, 10);             system.out.println ("--------Total Query results------");             int totalHits = topDocs.totalHits;             system.out.println ("Totalhits"  +  ":"  + totalhits );             for  (scoredoc scoredoc :  Topdocs.scoredocs)  {                 // scoredoc.doc Gets the docid                 int docId = scoreDoc.doc;                 system.out.println ("DocId:"  + docId);                 Document  Document = indexsearch.doc (docId);                 system.out.println (Document.get ("id"));                 system.out.println (Document.get ("title"));         &nBsp;       system.out.println (Document.get ("content"));                 system.out.println (Document.get ( "url"));            }             nrtmanager.release (Indexsearch);             nrtmanager.close ();         }  catch  (ioexception e)  {             // TODO Auto-generated catch block             e.printstacktrace ();        }     }}

The code is hosted on GitHub: https://github.com/quantmod/JavaCrawl/blob/master/src/com/lulei/util/MyCrawl.java

Reference article:

http://blog.csdn.net/xiaojimanman/article/details/40891791

This article is from the "bit accumulation" blog; when reposting, please keep this source attribution: http://tianxingzhe.blog.51cto.com/3390077/1755054

Java web crawler crawl Baidu News

This article is an English version of an article originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Java web crawler crawl Baidu News

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Java web crawler crawl Baidu News

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support