Java WebClient Summary

Source: Internet
Author: User

PrivateWebClient getawebclient () {WebClient WebClient=NewWebClient (browserversion.firefox_24); Webclient.getoptions (). SetTimeout (20000); //Webclient.getcookiemanager (). Setcookiesenabled (true);Webclient.getoptions (). Setthrowexceptiononfailingstatuscode (false); Webclient.getoptions (). Setthrowexceptiononscripterror (false); Webclient.getoptions (). setcssenabled (false); Webclient.getoptions (). setjavascriptenabled (false); Webclient.addrequestheader ("Accept", "textml,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); Webclient.addrequestheader ("Accept-encoding", "gzip, deflate"); Webclient.addrequestheader ("Accept-language", "en-us,en;q=0.5"); Webclient.addrequestheader ("Cache-control", "max-age=0"); Webclient.addrequestheader ("Connection", "keep-alive"); Webclient.addrequestheader ("Host", "www.amazon.com"); Webclient.addrequestheader ("User-agent", "mozilla/5.0 (X11; Linux x86_64; rv:24.0) gecko/20100101 firefox/24.0 "); returnwebClient; }
/*** Collect Web pages*/     PublicStringBuilder crawlpage (String url) {StringBuilder builder=NewStringBuilder (); Logger.info (Thread.CurrentThread (). GetName ()+ "Crawl" +URL); //Here's the Mygetpage code .Webclient.getcookiemanager (). Clearcookies (); Logger.info (Thread.CurrentThread (). GetName ()+ "Webclient.getcookiemanager (). Clearcookies ();"); File File=NewFile (Cookiepathappendrandom ()); Logger.info (Thread.CurrentThread (). GetName ()+ "File File = new file (Cookiepathappendrandom ());"); if(File.exists ()) {FileInputStream fin=NULL; Try{fin=Newfileinputstream (file); } Catch(FileNotFoundException E1) {e1.printstacktrace (); } cookiestore Cookiestore=NULL;            ObjectInputStream in; Try{ in=NewObjectInputStream (Fin); Cookiestore=(Cookiestore) in.readobject ();            In.close (); } Catch(IOException e) {logger.error (e); } Catch(ClassNotFoundException e) {logger.error (e); } List<org.apache.http.cookie.Cookie> L =cookiestore.getcookies ();  for(Org.apache.http.cookie.Cookie temp:l) {Cookie Cookie=NewCookie (Temp.getdomain (), Temp.getname (), Temp.getvalue (), Temp.getpath (), Temp.getexpirydate () , false);            Webclient.getcookiemanager (). Addcookie (cookie); }} logger.info (Thread.CurrentThread (). GetName ()+ "Mygetpage start,url:" +URL); HtmlPage Page= Mygetpage (Newstringbuffer (URL)); Logger.info (Thread.CurrentThread (). GetName ()+ "Mygetpage end,url:" +URL); if(page = =NULL) {            //The model that appears in the acquisition process can be uniformly placed in a list and sent to the server to rejoin the collection allocation queueLogger.info ("Page null!"); Amazoncrawlmodel Model=NewAmazoncrawlmodel (crawlid, Crawlurlid, URL, depth,ischange);            Exceptionfun (model); return(NewStringBuilder ("Getnullpage")); } logger.info (Thread.CurrentThread (). GetName ()+ "Builder.append (Page.asxml ());");        Builder.append (Page.asxml ()); Logger.info (Thread.CurrentThread (). 
GetName ()+ "return builder;"); Logger.info (Thread.CurrentThread (). GetName ()+ "Crawlpage $Length =" +builder.tostring (). Length ()); if(Builder.tostring (). Length () <=300) {Amazoncrawlmodel model=NewAmazoncrawlmodel (crawlid, Crawlurlid, URL, depth,ischange);            Exceptionfun (model); return(NewStringBuilder ("Getnullpage")); }        returnBuilder; }

/*** * Custom GetPage, encountered verification code page recognition until successful **/    Privatehtmlpage mygetpage (StringBuffer URL) {htmlpage page=NULL; BooleanFlag =true; inttrytimecnt = 1; intunknowhosttrytimecnt = 1;  while(flag) {flag=false; Try{logger.info (Thread.CurrentThread (). GetName ()+ "Webclient.getpage:" + URL + ", crawlurl_id:" +crawlurlid); Page=Webclient.getpage (url.tostring ()); Document Doc=Jsoup.parse (Page.asxml ()); intRobotchecknum = 1;  while(Doc.select ("title"). Text (). Equals ("Robot Check") {logger.info (Thread.CurrentThread (). GetName ()+ " " +Dayformat1.format (System.currenttimemillis ())+ "[Robot check,url:" + URL + "]"); String Captcha_str= Amazongetcaptcha.getcaptcha (NewStringBuilder (doc.tostring ())); Logger.info (Thread.CurrentThread (). GetName ()+ " " +Dayformat1.format (System.currenttimemillis ())+ "End Amazongetcaptcha.getcaptcha"); Logger.info (Dayformat1.format (NewDate ()) + "" + Thread.CurrentThread (). GetName () + ":" +captcha_str); HtmlForm form=NULL; Logger.info (Thread.CurrentThread (). GetName ()+ "page.getforms (). Get (0) Start"); Form= Page.getforms (). Get (0); Logger.info (Thread.CurrentThread (). GetName ()+ "page.getforms (). Get (0) End"); HtmlButton Button=NULL; Logger.info (Thread.CurrentThread (). GetName ()+ "Form.getelementsbytagname" (Button). Get (0) Start "); Button= (HtmlButton) form.getelementsbytagname ("button"). Get (0); Logger.info (Thread.CurrentThread (). GetName ()+ "Form.getelementsbytagname" (Button). Get (0) End "); Logger.info (Thread.CurrentThread (). GetName ()+ "Setvalueattribute Start"); Form.getinputbyname ("Field-keywords"). Setvalueattribute (CAPTCHA_STR); Logger.info (Thread.CurrentThread (). GetName ()+ "Setvalueattribute End"); Logger.info (Thread.CurrentThread (). 
GetName ()+ "Button.Click Start"); BooleanClick_flag =false;  while(!Click_flag) {                        Try{Click_flag=true; Page=Button.Click (); } Catch(Exception E1) {Logger.error (Thread.CurrentThread (). GetName ()+ "Button.Click Error:" +E1); //e1.printstacktrace ();Click_flag =false; }} logger.info (Thread.CurrentThread (). GetName ()+ "Button.Click End");  while(Page.asxml () = =NULL) {Logger.info (Thread.CurrentThread (). GetName ()+ "page XML null"); Logger.info (Thread.CurrentThread (). GetName ()+" "+page.asxml ());                        Page.refresh (); Logger.info (Thread.CurrentThread (). GetName ()+ "Refresh end!"); } logger.info (Thread.CurrentThread (). GetName ()+ "Button.Click End"); Logger.info (Thread.CurrentThread (). GetName ()+ "Start parsepage!"); Doc=Jsoup.parse (Page.asxml ()); if(!doc.select ("title"). Text (). Equals ("Robot Check") {logger.info (Thread.CurrentThread (). GetName ()+ "" + doc.select ("title")). text ()); Logger.info (Thread.CurrentThread (). GetName ()+ "" + Dayformat1.format (System.currenttimemillis ()) + "[Robot Check,captcha success:" + Captcha_str + ", try Num:" + robotchecknum + "]"); } robotchecknum++; }            } Catch(failinghttpstatuscodeexception e) {logger.error (Thread.CurrentThread (). GetName ()+" "+e); Flag=true; } Catch(malformedurlexception e) {logger.error (Thread.CurrentThread (). GetName ()+" "+e); Flag=true; }Catch(unknownhostexception e) {logger.error (Thread.CurrentThread (). GetName ()+" "+e); Flag=true; Logger.info ("Found Unknownhostexception,start sleep min"); Try{Thread.Sleep (1000*60*integer.parseint (Configuration.getproperties ("Unknowhost_sleeptime"))); } Catch(interruptedexception E1) {Logger.error (Thread.CurrentThread (). GetName ()+" "+E1); } logger.info ("Found Unknownhostexception,end sleep min"); Unknowhosttrytimecnt++;//Access exception number plus oneLogger.info (Thread.CurrentThread (). 
GetName () + "" +Dayformat1.format (System.currenttimemillis ())+ "[unknowhosttrytimecnt:" + unknowhosttrytimecnt + "]"); if(Unknowhosttrytimecnt > Integer.parseint (configuration.getproperties ("Unknowhost_maxtrytime"))) {                    return NULL; }            }Catch(Exception eq) {logger.error (Thread.CurrentThread (). GetName ()+ " "+eq); Trytimecnt++;//Access exception number plus oneLogger.info (Thread.CurrentThread (). GetName () + "" +Dayformat1.format (System.currenttimemillis ())+ "[trytimecnt:" + trytimecnt + "]"); if(Trytimecnt > 5) {                    return NULL; }                Try{Thread.Sleep (1000); } Catch(interruptedexception e) {e.printstacktrace (); Logger.error (Thread.CurrentThread (). GetName ()+e); } Flag=true; }            Try{thread.sleep (Random.nextint (500) + 1500); } Catch(interruptedexception e) {logger.error (Thread.CurrentThread (). GetName ()+e); Flag=true; }        }        returnpage; }

Java WebClient Summary

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support, 6 Free Tickets per Quarter, Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.