Here is the code:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal page scraper: downloads a web page as text and pulls fragments out
 * of it with regular expressions.
 */
public class TestHtml {

    /** Matches a complete {@code <title>...</title>} element (non-greedy). */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);

    /** Matches any single {@code <...>} tag (non-greedy). */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads all the contents of a web page.
     *
     * <p>The response is decoded as GBK and its lines are concatenated
     * without line separators, so the result is a single line of text.
     *
     * @param htmlurl address of the page to download
     * @return the page text with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println(
                    "There is a problem with the URL format you entered! Please carefully enter");
            throw me;
        }

        final StringBuilder sb = new StringBuilder();
        final InputStream stream = url.openStream();
        // try/finally rather than try-with-resources: the rest of the file is
        // written at a pre-Java-7 level (explicit generic type arguments).
        try {
            final BufferedReader in = new BufferedReader(new InputStreamReader(stream, "GBK"));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } finally {
            // Closing the underlying stream releases the connection even when
            // decoding or reading fails part-way through.
            stream.close();
        }
        return sb.toString();
    }

    /**
     * Gets the page title.
     *
     * @param s page text to search
     * @return the text of every {@code <title>} element found, concatenated
     *         in order with the tags removed; empty if there is none
     */
    public String getTitle(final String s) {
        return joinMatchesWithoutTags(TITLE_PATTERN, s);
    }

    /**
     * Gets the parts of the text that match a caller-supplied regular expression.
     *
     * @param s        text to search
     * @param regexarg regular expression to look for; compiled with
     *                 {@link Pattern#CANON_EQ}
     * @return all matches concatenated in order, with any {@code <...>} tags
     *         removed; empty if nothing matches
     */
    public String getByRegex(final String s, final String regexarg) {
        return joinMatchesWithoutTags(Pattern.compile(regexarg, Pattern.CANON_EQ), s);
    }

    /** Concatenates every match of {@code pattern} in {@code s}, then strips tags. */
    private static String joinMatchesWithoutTags(final Pattern pattern, final String s) {
        final StringBuilder joined = new StringBuilder();
        final Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            joined.append(ma.group());
        }
        return outTag(joined.toString());
    }

    /** Removes every {@code <...>} tag from {@code s}, leaving the text between tags. */
    private static String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads a fixed product page and prints the text of its price span.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(final String[] args) {
        final String url = "http://detail.1688.com/offer/41797007099.html?tracelog=p4p";
        try {
            final TestHtml testHtml = new TestHtml();
            final String html = testHtml.getOneHtml(url);
            final String regex = "<span class=\"value price-length-5\">.*?</span>";
            final String content = testHtml.getByRegex(html, regex);
            System.out.println("Contet is:" + content);
        } catch (final Exception e) {
            // Top-level boundary: report the failure instead of discarding it.
            System.err.println("Failed to fetch " + url + ": " + e);
        }
    }
}
Crawling web page content with Java