Java Development Search Engine crawler

Source: Internet
Author: User

package com.peidon.html;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal crawler example: fetches one web page, parses it with jsoup, and
 * downloads every image referenced by an {@code <img>} tag to a local directory.
 *
 * <p>jsoup offers jQuery-like selectors for working with the HTML DOM tree.
 * Required JAR: jsoup-1.8.3.jar.
 *
 * <p>Created: August 15, 2015, 9:01:13 AM.
 *
 * @author Sunshine
 * @version 1.0
 */
public class Httpsoup {

    /** Maximum time to wait for a TCP connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Maximum time to wait for data once connected, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** Size of the buffer used when copying a download to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Fetches the page, extracts every {@code <img>} tag and downloads each image.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String pageUrl = "http://www.ui.cn/";

        // Get the source code of the web page based on the URL and page encoding.
        String htmlResource = getHtmlResourceByUrl(pageUrl, "UTF-8");

        // Parse with the page URL as base URI so relative image paths can be resolved.
        Document document = Jsoup.parse(htmlResource, pageUrl);

        // Collect the image tags of the page.
        Elements elements = document.getElementsByTag("img");

        for (Element element : elements) {
            // absUrl yields an empty string when no absolute URL can be built.
            String imgSrc = element.absUrl("src");
            if (imgSrc.isEmpty()) {
                continue;
            }
            if (download(imgSrc, "d:\\test\\images\\")) {
                System.out.println("Download succeeded:" + imgSrc);
            } else {
                System.out.println("Download failed:" + imgSrc);
            }
        }
    }

    /**
     * Downloads the image at the given URL into the given directory, creating the
     * directory if necessary. The file is named after the last path segment of the URL.
     *
     * <p>Failures are reported on standard error; this method does not throw.
     *
     * @param imageUrl absolute URL of the image to download
     * @param filePath directory in which the downloaded image is stored
     */
    public static void downImages(String imageUrl, String filePath) {
        download(imageUrl, filePath);
    }

    /**
     * Performs the download for {@link #downImages(String, String)}.
     *
     * @return {@code true} if the image was written completely, {@code false} otherwise
     */
    private static boolean download(String imageUrl, String filePath) {
        if (imageUrl == null || filePath == null) {
            System.err.println("Download skipped: image URL and target directory must not be null");
            return false;
        }

        String fileName = fileNameOf(imageUrl);
        if (fileName == null) {
            System.err.println("Download skipped, no usable file name in URL: " + imageUrl);
            return false;
        }

        // Create the target directory if it does not exist yet.
        File directory = new File(filePath);
        if (!directory.isDirectory() && !directory.mkdirs()) {
            System.err.println("Download skipped, cannot create directory: " + filePath);
            return false;
        }

        HttpURLConnection connection = null;
        try {
            URLConnection raw = new URL(imageUrl).openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                System.err.println("Download skipped, not an HTTP(S) URL: " + imageUrl);
                return false;
            }
            connection = (HttpURLConnection) raw;
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);

            // Both streams are closed even when the copy fails part-way.
            try (InputStream in = connection.getInputStream();
                 OutputStream out = new FileOutputStream(new File(directory, fileName))) {
                byte[] buffer = new byte[COPY_BUFFER_SIZE];
                int count;
                // read() signals end of stream with -1.
                while ((count = in.read(buffer)) != -1) {
                    out.write(buffer, 0, count);
                }
            }
            return true;
        } catch (IOException e) {
            System.err.println("Download failed for " + imageUrl + ": " + e);
            return false;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Derives a local file name from the last path segment of a URL, ignoring any
     * query string or fragment.
     *
     * @return the file name, or {@code null} if the URL has no usable last segment
     */
    private static String fileNameOf(String imageUrl) {
        String path = imageUrl;
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }

        // lastIndexOf returns -1 when there is no slash, so +1 keeps the whole string.
        String name = path.substring(path.lastIndexOf('/') + 1);

        // Replace characters that are path separators or illegal in Windows file names.
        name = name.replaceAll("[\\\\:*?\"<>|]", "_");

        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            return null;
        }
        return name;
    }

    /**
     * Gets the source code of a web page.
     *
     * <p>Failures are reported on standard error; this method does not throw. On
     * failure the text read so far (possibly empty) is returned.
     *
     * @param url      address of the page to fetch
     * @param encoding name of the character encoding used to decode the response
     * @return the page source with each line terminated by {@code '\n'}
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        // Container that accumulates the page source.
        StringBuilder buff = new StringBuilder();

        if (encoding == null) {
            System.err.println("Cannot fetch " + url + ": encoding must not be null");
            return buff.toString();
        }

        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);

            // The raw stream is declared separately so it is closed even if the
            // encoding name is rejected while constructing the reader.
            try (InputStream stream = connection.getInputStream();
                 BufferedReader reader =
                         new BufferedReader(new InputStreamReader(stream, encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buff.append(line).append('\n');
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url + ": " + e);
        }

        return buff.toString();
    }
}

Java Development Search Engine crawler

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of the page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support, 6 Free Tickets per Quarter, Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.