PrivateWebClient getawebclient () {WebClient WebClient=NewWebClient (browserversion.firefox_24); Webclient.getoptions (). SetTimeout (20000); //Webclient.getcookiemanager (). Setcookiesenabled (true);Webclient.getoptions (). Setthrowexceptiononfailingstatuscode (false); Webclient.getoptions (). Setthrowexceptiononscripterror (false); Webclient.getoptions (). setcssenabled (false); Webclient.getoptions (). setjavascriptenabled (false); Webclient.addrequestheader ("Accept", "textml,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); Webclient.addrequestheader ("Accept-encoding", "gzip, deflate"); Webclient.addrequestheader ("Accept-language", "en-us,en;q=0.5"); Webclient.addrequestheader ("Cache-control", "max-age=0"); Webclient.addrequestheader ("Connection", "keep-alive"); Webclient.addrequestheader ("Host", "www.amazon.com"); Webclient.addrequestheader ("User-agent", "mozilla/5.0 (X11; Linux x86_64; rv:24.0) gecko/20100101 firefox/24.0 "); returnwebClient; }
/*** Collect Web pages*/ PublicStringBuilder crawlpage (String url) {StringBuilder builder=NewStringBuilder (); Logger.info (Thread.CurrentThread (). GetName ()+ "Crawl" +URL); //Here's the Mygetpage code .Webclient.getcookiemanager (). Clearcookies (); Logger.info (Thread.CurrentThread (). GetName ()+ "Webclient.getcookiemanager (). Clearcookies ();"); File File=NewFile (Cookiepathappendrandom ()); Logger.info (Thread.CurrentThread (). GetName ()+ "File File = new file (Cookiepathappendrandom ());"); if(File.exists ()) {FileInputStream fin=NULL; Try{fin=Newfileinputstream (file); } Catch(FileNotFoundException E1) {e1.printstacktrace (); } cookiestore Cookiestore=NULL; ObjectInputStream in; Try{ in=NewObjectInputStream (Fin); Cookiestore=(Cookiestore) in.readobject (); In.close (); } Catch(IOException e) {logger.error (e); } Catch(ClassNotFoundException e) {logger.error (e); } List<org.apache.http.cookie.Cookie> L =cookiestore.getcookies (); for(Org.apache.http.cookie.Cookie temp:l) {Cookie Cookie=NewCookie (Temp.getdomain (), Temp.getname (), Temp.getvalue (), Temp.getpath (), Temp.getexpirydate () , false); Webclient.getcookiemanager (). Addcookie (cookie); }} logger.info (Thread.CurrentThread (). GetName ()+ "Mygetpage start,url:" +URL); HtmlPage Page= Mygetpage (Newstringbuffer (URL)); Logger.info (Thread.CurrentThread (). GetName ()+ "Mygetpage end,url:" +URL); if(page = =NULL) { //The model that appears in the acquisition process can be uniformly placed in a list and sent to the server to rejoin the collection allocation queueLogger.info ("Page null!"); Amazoncrawlmodel Model=NewAmazoncrawlmodel (crawlid, Crawlurlid, URL, depth,ischange); Exceptionfun (model); return(NewStringBuilder ("Getnullpage")); } logger.info (Thread.CurrentThread (). GetName ()+ "Builder.append (Page.asxml ());"); Builder.append (Page.asxml ()); Logger.info (Thread.CurrentThread (). GetName ()+ "return builder;"); Logger.info (Thread.CurrentThread (). GetName ()+ "Crawlpage $Length =" +builder.tostring (). Length ()); if(Builder.tostring (). Length () <=300) {Amazoncrawlmodel model=NewAmazoncrawlmodel (crawlid, Crawlurlid, URL, depth,ischange); Exceptionfun (model); return(NewStringBuilder ("Getnullpage")); } returnBuilder; }
/*** * Custom GetPage, encountered verification code page recognition until successful **/ Privatehtmlpage mygetpage (StringBuffer URL) {htmlpage page=NULL; BooleanFlag =true; inttrytimecnt = 1; intunknowhosttrytimecnt = 1; while(flag) {flag=false; Try{logger.info (Thread.CurrentThread (). GetName ()+ "Webclient.getpage:" + URL + ", crawlurl_id:" +crawlurlid); Page=Webclient.getpage (url.tostring ()); Document Doc=Jsoup.parse (Page.asxml ()); intRobotchecknum = 1; while(Doc.select ("title"). Text (). Equals ("Robot Check") {logger.info (Thread.CurrentThread (). GetName ()+ " " +Dayformat1.format (System.currenttimemillis ())+ "[Robot check,url:" + URL + "]"); String Captcha_str= Amazongetcaptcha.getcaptcha (NewStringBuilder (doc.tostring ())); Logger.info (Thread.CurrentThread (). GetName ()+ " " +Dayformat1.format (System.currenttimemillis ())+ "End Amazongetcaptcha.getcaptcha"); Logger.info (Dayformat1.format (NewDate ()) + "" + Thread.CurrentThread (). GetName () + ":" +captcha_str); HtmlForm form=NULL; Logger.info (Thread.CurrentThread (). GetName ()+ "page.getforms (). Get (0) Start"); Form= Page.getforms (). Get (0); Logger.info (Thread.CurrentThread (). GetName ()+ "page.getforms (). Get (0) End"); HtmlButton Button=NULL; Logger.info (Thread.CurrentThread (). GetName ()+ "Form.getelementsbytagname" (Button). Get (0) Start "); Button= (HtmlButton) form.getelementsbytagname ("button"). Get (0); Logger.info (Thread.CurrentThread (). GetName ()+ "Form.getelementsbytagname" (Button). Get (0) End "); Logger.info (Thread.CurrentThread (). GetName ()+ "Setvalueattribute Start"); Form.getinputbyname ("Field-keywords"). Setvalueattribute (CAPTCHA_STR); Logger.info (Thread.CurrentThread (). GetName ()+ "Setvalueattribute End"); Logger.info (Thread.CurrentThread (). GetName ()+ "Button.Click Start"); BooleanClick_flag =false; while(!Click_flag) { Try{Click_flag=true; Page=Button.Click (); } Catch(Exception E1) {Logger.error (Thread.CurrentThread (). GetName ()+ "Button.Click Error:" +E1); //e1.printstacktrace ();Click_flag =false; }} logger.info (Thread.CurrentThread (). GetName ()+ "Button.Click End"); while(Page.asxml () = =NULL) {Logger.info (Thread.CurrentThread (). GetName ()+ "page XML null"); Logger.info (Thread.CurrentThread (). GetName ()+" "+page.asxml ()); Page.refresh (); Logger.info (Thread.CurrentThread (). GetName ()+ "Refresh end!"); } logger.info (Thread.CurrentThread (). GetName ()+ "Button.Click End"); Logger.info (Thread.CurrentThread (). GetName ()+ "Start parsepage!"); Doc=Jsoup.parse (Page.asxml ()); if(!doc.select ("title"). Text (). Equals ("Robot Check") {logger.info (Thread.CurrentThread (). GetName ()+ "" + doc.select ("title")). text ()); Logger.info (Thread.CurrentThread (). GetName ()+ "" + Dayformat1.format (System.currenttimemillis ()) + "[Robot Check,captcha success:" + Captcha_str + ", try Num:" + robotchecknum + "]"); } robotchecknum++; } } Catch(failinghttpstatuscodeexception e) {logger.error (Thread.CurrentThread (). GetName ()+" "+e); Flag=true; } Catch(malformedurlexception e) {logger.error (Thread.CurrentThread (). GetName ()+" "+e); Flag=true; }Catch(unknownhostexception e) {logger.error (Thread.CurrentThread (). GetName ()+" "+e); Flag=true; Logger.info ("Found Unknownhostexception,start sleep min"); Try{Thread.Sleep (1000*60*integer.parseint (Configuration.getproperties ("Unknowhost_sleeptime"))); } Catch(interruptedexception E1) {Logger.error (Thread.CurrentThread (). GetName ()+" "+E1); } logger.info ("Found Unknownhostexception,end sleep min"); Unknowhosttrytimecnt++;//Access exception number plus oneLogger.info (Thread.CurrentThread (). GetName () + "" +Dayformat1.format (System.currenttimemillis ())+ "[unknowhosttrytimecnt:" + unknowhosttrytimecnt + "]"); if(Unknowhosttrytimecnt > Integer.parseint (configuration.getproperties ("Unknowhost_maxtrytime"))) { return NULL; } }Catch(Exception eq) {logger.error (Thread.CurrentThread (). GetName ()+ " "+eq); Trytimecnt++;//Access exception number plus oneLogger.info (Thread.CurrentThread (). GetName () + "" +Dayformat1.format (System.currenttimemillis ())+ "[trytimecnt:" + trytimecnt + "]"); if(Trytimecnt > 5) { return NULL; } Try{Thread.Sleep (1000); } Catch(interruptedexception e) {e.printstacktrace (); Logger.error (Thread.CurrentThread (). GetName ()+e); } Flag=true; } Try{thread.sleep (Random.nextint (500) + 1500); } Catch(interruptedexception e) {logger.error (Thread.CurrentThread (). GetName ()+e); Flag=true; } } returnpage; }
Java WebClient Summary