Java web crawler: crawling Baidu News

With Commons-HttpClient
Commons-HttpClient is a legacy library that has been officially deprecated; its successor is Apache HttpComponents HttpClient (a sketch of the equivalent request with the newer API appears after the main listing below).
This example uses Lucene 4.3.
Required JAR Packages
(Screenshot of the required JAR packages. Judging from the imports in the listing below, these include commons-httpclient and its commons-logging/commons-codec dependencies, log4j, the Lucene 4.3 core and analyzers-common JARs, and junit.)
package com.lulei.util;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NRTManager;
import org.apache.lucene.search.NRTManager.TrackingIndexWriter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

public class MyCrawl {

    private static int maxConnectTimes = 3;
    private static HttpClient httpClient = new HttpClient();
    private static Logger log = Logger.getLogger(MyCrawl.class);
    private static Header[] responseHeaders = null;
    private static String pageSourceCode = "";
    // Default encoding used to read the raw page bytes
    private static String charsetName = "iso-8859-1";

    // Crawler + index building
    public static void main(String[] args) {
        String urlSeed = "http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab";
        HashMap<String, String> params = new HashMap<String, String>();
        params.put("Referer", "http://www.baidu.com");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");

        GetMethod getMethod = new GetMethod(urlSeed);
        for (Map.Entry<String, String> entry : params.entrySet()) {
            getMethod.setRequestHeader(entry.getKey(), entry.getValue());
        }

        // Read the page source into pageSourceCode
        try {
            readPage(getMethod, "utf-8", urlSeed);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(pageSourceCode);

        // The regex must match the raw page source ("view source"),
        // not the DOM that Firebug shows.
        String regexStr = "&#8226;<a href=\"(.*?)\"";
        Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(pageSourceCode);
        int count = 0;
        while (matcher.find()) {
            System.out.println(matcher.group());
            System.out.println(matcher.group(1));
            System.out.println(matcher.groupCount());
            count++;
        }
        System.out.println(count);
    }

    private static boolean readPage(HttpMethod method, String defaultCharset, String urlStr)
            throws HttpException, IOException {
        int n = maxConnectTimes;
        while (n > 0) {
            try {
                // Execute once and keep the status code
                // (the original executed the request twice when logging failures)
                int status = httpClient.executeMethod(method);
                if (status != HttpStatus.SC_OK) {
                    log.error("Can not connect " + urlStr + "\t"
                            + (maxConnectTimes - n + 1) + "\t" + status);
                    n--;
                } else {
                    // Response headers
                    responseHeaders = method.getResponseHeaders();
                    // Page source
                    InputStream inputStream = method.getResponseBodyAsStream();
                    BufferedReader bufferedReader = new BufferedReader(
                            new InputStreamReader(inputStream, charsetName));
                    StringBuffer stringBuffer = new StringBuffer();
                    String lineString = null;
                    while ((lineString = bufferedReader.readLine()) != null) {
                        stringBuffer.append(lineString);
                        stringBuffer.append("\n");
                    }
                    pageSourceCode = stringBuffer.toString();
                    // Detect the real charset and re-decode if it differs
                    InputStream in = new ByteArrayInputStream(
                            pageSourceCode.getBytes(charsetName));
                    String charset = CharsetUtil.getStreamCharset(in, defaultCharset);
                    // Special case added for the IP-location lookup pages
                    if ("Big5".equals(charset)) {
                        charset = "gbk";
                    }
                    if (!charsetName.toLowerCase().equals(charset.toLowerCase())) {
                        pageSourceCode = new String(
                                pageSourceCode.getBytes(charsetName), charset);
                    }
                    return true;
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println(urlStr + " -- can't connect " + (maxConnectTimes - n + 1));
                n--;
            }
        }
        return false;
    }

    // Near-real-time search
    @Test
    public void search() {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
        indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        String indexFile = "D:/index/knnik";
        Directory directory = null;
        try {
            directory = NIOFSDirectory.open(new File(indexFile));
            // Create the index writer and wrap it for NRT search
            IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
            TrackingIndexWriter trackingIndexWriter = new TrackingIndexWriter(indexWriter);
            NRTManager nrtManager = new NRTManager(trackingIndexWriter, new SearcherFactory());

            // Query the index
            IndexSearcher indexSearch = nrtManager.acquire();
            /*
             * The usual (non-real-time) way to obtain an IndexSearcher:
             * IndexReader indexReader = DirectoryReader.open(directory);
             * IndexSearcher indexSearch = new IndexSearcher(indexReader);
             */
            Term term = new Term("content", "we");
            Query query = new TermQuery(term);
            TopDocs topDocs = indexSearch.search(query, 10);
            System.out.println("--------Total query results------");
            int totalHits = topDocs.totalHits;
            System.out.println("totalHits" + ":" + totalHits);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // scoreDoc.doc is the document id
                int docId = scoreDoc.doc;
                System.out.println("docId:" + docId);
                Document document = indexSearch.doc(docId);
                System.out.println(document.get("id"));
                System.out.println(document.get("title"));
                System.out.println(document.get("content"));
                System.out.println(document.get("url"));
            }
            nrtManager.release(indexSearch);
            nrtManager.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
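The listing calls CharsetUtil.getStreamCharset, a helper from the author's com.lulei.util package whose source is not included here. To keep the example compilable, here is a minimal sketch of what such a helper might look like: it scans the page for an HTML meta charset declaration and falls back to the supplied default. The method name and signature come from the call site above; the body is an assumption, not the author's implementation.

package com.lulei.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Sketch only: the real helper may use a charset-detection library instead.
public class CharsetUtil {

    // Matches a declaration such as <meta charset="utf-8"> or
    // <meta http-equiv="Content-Type" content="text/html; charset=gbk">
    private static final Pattern META_CHARSET =
            Pattern.compile("charset\\s*=\\s*[\"']?([\\w-]+)", Pattern.CASE_INSENSITIVE);

    public static String getStreamCharset(InputStream in, String defaultCharset)
            throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "iso-8859-1"));
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher m = META_CHARSET.matcher(line);
            if (m.find()) {
                return m.group(1); // charset declared in the page itself
            }
        }
        return defaultCharset; // no declaration found
    }
}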
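One gap worth noting: main() only crawls and search() only queries, so nothing in the listing actually adds documents carrying the id, title, content, and url fields that search() reads. Below is a minimal sketch of that missing step using the same Lucene 4.3 NRT classes; the field names match the search code, but the helper class and method are assumptions about how the crawl results would be wired into the index.

package com.lulei.util;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.NRTManager;
import org.apache.lucene.search.NRTManager.TrackingIndexWriter;

// Hypothetical helper: adds one crawled page to the index.
public class IndexHelper {

    public static void indexPage(TrackingIndexWriter writer, NRTManager nrtManager,
            String id, String title, String content, String url) throws IOException {
        Document doc = new Document();
        doc.add(new StringField("id", id, Store.YES));      // stored, not analyzed
        doc.add(new TextField("title", title, Store.YES));  // stored and analyzed
        doc.add(new TextField("content", content, Store.YES));
        doc.add(new StringField("url", url, Store.YES));
        // TrackingIndexWriter returns a generation number for the change
        long generation = writer.addDocument(doc);
        // Block until searchers acquired from the NRTManager can see it
        nrtManager.waitForGeneration(generation);
    }
}

After a call like this, the TermQuery on the content field in search() would have documents to return.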
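As noted at the top, Commons-HttpClient is deprecated in favor of Apache HttpComponents HttpClient (4.x). For comparison, here is a minimal sketch of the same GET request with the newer API; the URL and headers mirror main() above, and the retry and charset-detection logic is omitted for brevity.

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class ModernFetch {

    public static void main(String[] args) throws IOException {
        String urlSeed = "http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab";
        CloseableHttpClient client = HttpClients.createDefault();
        try {
            HttpGet get = new HttpGet(urlSeed);
            get.setHeader("Referer", "http://www.baidu.com");
            get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                    + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
            HttpResponse response = client.execute(get);
            // EntityUtils honors the charset in the Content-Type header,
            // falling back to the one supplied here.
            String pageSource = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(pageSource);
        } finally {
            client.close();
        }
    }
}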
The code is hosted on GitHub: https://github.com/quantmod/JavaCrawl/blob/master/src/com/lulei/util/MyCrawl.java
Reference article:
http://blog.csdn.net/xiaojimanman/article/details/40891791
This article is from the "bit accumulation" blog; please be sure to keep this source: http://tianxingzhe.blog.51cto.com/3390077/1755054