Java web crawler: crawling Baidu News

Source: Internet
Author: User


With Commons-httpclient

Commons HttpClient is a legacy library that has been officially deprecated by its maintainers.


Lucene version 4.3 is used.

Required JAR Packages

<img src="http://s2.51cto.com/wyfs02/M02/7D/FB/wKiom1b0E3uz0lXjAAAyFb6bkVY732.png" title="capture.PNG" alt="wKiom1b0E3uz0lXjAAAyFb6bkVY732.png" />


Package com.lulei.util;import java.io.bufferedreader;import java.io.bytearrayinputstream;import  java.io.File;import java.io.IOException;import java.io.InputStream;import  java.io.inputstreamreader;import java.util.hashmap;import java.util.iterator;import  java.util.map;import java.util.regex.matcher;import java.util.regex.pattern;import  org.apache.commons.httpclient.header;import org.apache.commons.httpclient.httpclient;import  org.apache.commons.httpclient.httpexception;import org.apache.commons.httpclient.httpmethod;import  Org.apache.commons.httpclient.httpstatus;import org.apache.commons.httpclient.methods.getmethod;import  org.apache.log4j.Logger;import org.apache.lucene.analysis.Analyzer;import  Org.apache.lucene.analysis.standard.standardanalyzer;import org.apache.lucene.document.document;import  org.apache.lucene.index.directoryreader;import org.apache.lucene.index.indexreader;import org.apache.lucene.index.indexwriter;import  org.apache.lucene.index.indexwriterconfig;import org.apache.lucene.index.indexwriterconfig.openmode; import org.apache.lucene.index.term;import org.apache.lucene.search.indexsearcher;import  Org.apache.lucene.search.nrtmanager;import org.apache.lucene.search.nrtmanager.trackingindexwriter;import  org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import  org.apache.lucene.search.searcherfactory;import org.apache.lucene.search.termquery;import  org.apache.lucene.search.topdocs;import org.apache.lucene.store.directory;import  org.apache.lucene.store.niofsdirectory;import org.apache.lucene.util.version;import org.junit.test; Public class mycrawl {    private static int maxconnecttimes  = 3;    private static httpclient httpclient = neW httpclient ();    private static logger log =  Logger.getlogger (mycrawl.class);     private static header[] responseheaders  = null;    private static String pageSourceCode =  "";     //  page Default encoding method     private static String  charsetname =  "iso-8859-1";    //  
regular matching needs to see the source of the Web page, firebug see not      //  crawler + Build index     public static void main (String[] args)  {        String urlSeed =  "http://news.baidu.com/ N?cmd=4&class=sportnews&pn=1&from=tab ";        hashmap< String, string> params = new hashmap<string, string> ();         params.Put ("Referer",  "http://www.baidu.com");         params.put (                  "User-agent",                  "mozilla/5.0  ( WINDOWS&NBSP;NT&NBSP;6.1;&NBSP;WOW64)  AppleWebKit/537.36  (Khtml, like gecko)  chrome/ 36.0.1985.125 safari/537.36 ");         getmethod getmethod  = new getmethod (urlseed);         iterator iter  = params.entryset (). Iterator ();        while  ( Iter.hasnext ())  {            Map.Entry  entry =  (Map.entry)  iter.next ();             string key =  (String)  entry.getkey ();             string val =  (String)  entry.getvalue ();             getmethod.setrequestheader (Key, val);         }        //  get page Source to pagesourcecode variable          try {             readpage (getmethod,  "Utf-8",  urlseed);        }  catch  (exception e)  {             E.printstacktrace ();        }         system.out.println (Pagesourcecode);        string  regexstr =  "& #8226; &lT;a href=\ "(. *?) 
\ "";         pattern pattern = pattern.compile (REGEXSTR,  Pattern.CASE_INSENSITIVE                 | pattern.dotall);         matcher matcher  = pattern.matcher (Pagesourcecode);         int count  = 0;        while  (Matcher.find ())  {             system.out.println (Matcher.group ());             system.out.println (Matcher.group (1));             system.out.println (Matcher.groupCount ());             count++;       &nBsp; }        system.out.println (count);     }     private static boolean readpage (httpmethod method, string  defaultCharset,            String  URLSTR)  throws HttpException, IOException {         int n = maxConnectTimes;        while  (n  > 0)  {            try {                 if  ( Httpclient.executemethod (method) &NBSP;!=&NBSP;HTTPSTATUS.SC_OK)  {                     log.error ("Can not  connect  " + urlstr +  "\ T"                              +  (maxconnecttimes - n  + 1)  +  "\ T"                              +  Httpclient.executemethod (method));                     n--;                 } else {                     //  Get header information                       Responseheaders = method.getresponseheadERS ();                     //  get page source code                      InputStream inputStream =  Method.getresponsebodyasstream ();                     BufferedReader bufferedReader = new  BufferedReader (                             new inputstreamreader ( Inputstream, charsetname));                     stringbuffer stringbuffer = new stringbuffer ();                     String lineString = null;                      while  ((Linestring = bufferedreader.readline ())  != null)  {                          stringbuffer.append (lineString);                         stringbuffer.append ( "\ n");                     }                     pagesourcecode = stringbuffer.tostring ();                     InputStream in = new  Bytearrayinputstream (                             pagesourcecode.getbytes ( CharsetName));                     
string charset = charsetutil.getstreamcharset (in,                              defaultcharset);                     //  The following judgment is for IP attribution query deliberately added                       if  ("Big5". Equals (charsET))  {                         charset =  "GBK";                     }                     if  (! Charsetname.tolowercase ()                              .equals ( Charset.tolowercase ()))  {                         pagesourcecode = new string (                                  pagesourcecode.getbytes (CharsetName),  charset);                     }                      return true;                 }            } catch  ( Exception e)  {                 e.printstacktrace ();                 system.out.println (urlstr +  " -- can ' t connect  "                           +  (maxconnecttimes - n + 1));                 n--;             }        }         return false;    }    //  Real-time search     @ Test    public void search ()  {         analyzer analyzer = new standardanalyzer (version.lucene_43);         indexwriterconfig indexwriterconfig = new indexwriterconfig (  &NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;&NBSP;VERSION.LUCENE_43,  analyzer);         indexwriterconfig.setopenmode (OpenMode.CREATE_ Or_append);        String indexFile =  "D:/index/knnik";         Directory directory = null;         try {            directory =  niofsdirectory.open (New file (indexfile));             //  Create an index              Indexwriter indexwriter = new indexwriter (directory,                     indexwriterconfig);             trackingindexwriter trackingindexwriter  = new trackingindexwriter (                 &nbsP;   indexwriter);             Nrtmanager nrtmanager = new nrtmanager (trackingindexwriter,                     new  Searcherfactory ());            //  Query index              IndexSearcher indexSearch =  Nrtmanager.acquire ();            /*              * //general method of 
acquiring Indexsearch, non-real-time   Indexreader             * indexreader= Directoryreader.open (directory);              *      &nbSp;       * indexsearcher indexsearch=new indexsearcher ( Indexreader);             */             term term = new term ("Content" ,  "we");             query query =  new termquery (term);             topdocs  topdocs = indexsearch.search (query, 10);             system.out.println ("--------Total Query results------");             int totalHits = topDocs.totalHits;             system.out.println ("Totalhits"  +  ":"  + totalhits );             for  (scoredoc scoredoc :  Topdocs.scoredocs)  {                 // scoredoc.doc Gets the docid                 int docId = scoreDoc.doc;                 system.out.println ("DocId:"  + docId);                 Document  Document = indexsearch.doc (docId);                 system.out.println (Document.get ("id"));                 system.out.println (Document.get ("title"));         &nBsp;       system.out.println (Document.get ("content"));                 system.out.println (Document.get ( "url"));            }             nrtmanager.release (Indexsearch);             nrtmanager.close ();         }  catch  (ioexception e)  {             // TODO Auto-generated catch block             e.printstacktrace ();        }     }}


The code is hosted on GitHub: https://github.com/quantmod/JavaCrawl/blob/master/src/com/lulei/util/MyCrawl.java

Reference article:

http://blog.csdn.net/xiaojimanman/article/details/40891791


This article is from the "bit accumulation" blog; please keep this source attribution when reposting: http://tianxingzhe.blog.51cto.com/3390077/1755054

Java web crawler: crawling Baidu News

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.