package com.peidon.html;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal image crawler: fetches the HTML of one page, parses it with jsoup and
 * downloads every {@code <img>} it references into a local directory.
 *
 * <p>jsoup offers jQuery-like access to the HTML DOM tree, which keeps the parsing
 * step short. Requires the jsoup JAR on the classpath (originally written against
 * jsoup-1.8.3.jar). Created: August 15, 2015.
 *
 * @author Sunshine
 * @version 1.0
 */
public class Httpsoup {

    /** Page whose images are downloaded by {@link #main(String[])}. */
    private static final String PAGE_URL = "http://www.ui.cn/";

    /** Character encoding used to decode {@link #PAGE_URL}. */
    private static final String PAGE_ENCODING = "UTF-8";

    /** Directory the downloaded images are written to. */
    private static final String IMAGE_DIR = "d:\\test\\images\\";

    /** Maximum time to wait for a TCP connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Maximum time to wait for data on an open connection, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** Size of the copy buffer used when saving an image. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Downloads every image referenced by {@link #PAGE_URL} into {@link #IMAGE_DIR}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Get the source code of the web page based on the URL and page encoding.
        String htmlResource = getHtmlResourceByUrl(PAGE_URL, PAGE_ENCODING);

        // Parse with a base URI so that relative <img src> values can be resolved.
        Document document = Jsoup.parse(htmlResource, PAGE_URL);

        Elements elements = document.getElementsByTag("img");
        for (Element element : elements) {
            // absUrl() yields "" when src is missing or cannot be made absolute.
            String imgSrc = element.absUrl("src");
            if (imgSrc.isEmpty()) {
                continue;
            }
            try {
                saveImage(imgSrc, IMAGE_DIR);
                // Reported only after the file has actually been written.
                System.out.println("Download succeeded:" + imgSrc);
            } catch (IOException e) {
                System.err.println("Download failed:" + imgSrc + " (" + e + ")");
            }
        }
    }

    /**
     * Downloads the image at {@code imageUrl} into the directory {@code filePath},
     * creating the directory if necessary.
     *
     * <p>This method is best-effort: any failure is reported on standard error via
     * a stack trace and is not propagated to the caller.
     *
     * @param imageUrl absolute HTTP(S) URL of the image to download
     * @param filePath directory the image is saved into
     */
    public static void downImages(String imageUrl, String filePath) {
        try {
            saveImage(imageUrl, filePath);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the content of {@code url} and returns it as text, with every line
     * terminated by {@code '\n'}.
     *
     * <p>Failures are not propagated: the stack trace is printed and whatever was
     * read before the failure (possibly the empty string) is returned.
     *
     * @param url      address of the page to fetch
     * @param encoding name of the charset used to decode the response
     * @return the page content read so far; never {@code null}
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        // Purely local buffer, so the unsynchronized StringBuilder is sufficient.
        StringBuilder buff = new StringBuilder();

        try {
            URLConnection uc = new URL(url).openConnection();
            // Without timeouts a stalled server would block the crawler forever.
            uc.setConnectTimeout(CONNECT_TIMEOUT_MS);
            uc.setReadTimeout(READ_TIMEOUT_MS);

            // The raw stream is its own resource so it is closed even when the
            // encoding name is rejected by the InputStreamReader constructor.
            try (InputStream raw = uc.getInputStream();
                    BufferedReader reader =
                            new BufferedReader(new InputStreamReader(raw, encoding))) {
                String tempLine;
                while ((tempLine = reader.readLine()) != null) {
                    buff.append(tempLine).append('\n');
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("conection timeout ...");
        }

        return buff.toString();
    }

    /**
     * Downloads one image and returns the file it was written to.
     *
     * @param imageUrl absolute HTTP(S) URL of the image
     * @param filePath directory to save into; created when missing
     * @return the written file
     * @throws IOException if the URL is malformed or not HTTP(S), the directory
     *                     cannot be created, or the transfer fails
     */
    private static File saveImage(String imageUrl, String filePath) throws IOException {
        URL url = new URL(imageUrl);
        String fileName = fileNameOf(url);

        File directory = new File(filePath);
        // mkdirs() also returns false when the directory already exists.
        if (!directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Cannot create directory: " + directory);
        }

        URLConnection rawConnection = url.openConnection();
        if (!(rawConnection instanceof HttpURLConnection)) {
            // Keeps page-supplied file: or jar: URLs from copying local files.
            throw new IOException("Not an HTTP(S) URL: " + imageUrl);
        }
        HttpURLConnection uc = (HttpURLConnection) rawConnection;
        uc.setConnectTimeout(CONNECT_TIMEOUT_MS);
        uc.setReadTimeout(READ_TIMEOUT_MS);

        File file = new File(directory, fileName);
        try (InputStream in = uc.getInputStream();
                FileOutputStream out = new FileOutputStream(file)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            // read() signals end of stream with -1.
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
        return file;
    }

    /**
     * Derives a local file name from the last path segment of {@code url},
     * ignoring the query string and fragment.
     *
     * @param url image URL
     * @return a plain file name without any directory component
     * @throws IOException if the URL has no usable last path segment
     */
    private static String fileNameOf(URL url) throws IOException {
        String path = url.getPath();
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        // File.getName() also strips platform-specific separators such as '\'.
        String name = new File(lastSegment).getName();
        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            throw new IOException("Cannot derive a file name from URL: " + url);
        }
        return name;
    }
}
// Java Development Search Engine crawler