Java: downloading web pages and reading their content

Source: Internet
Author: User
Tags: xslt

How can I read the downloaded content:

package com.core.crawl;

import java.io.IOException;

import com.util.file.Files;

/**
 * Entry point: downloads a single web resource on a worker thread and
 * prints the total elapsed time in milliseconds.
 */
public class Crawl {

    /**
     * Downloads http://www.w3c.org/robots.txt into the application's working
     * directory and reports how long the run took.
     *
     * @param args unused
     * @throws IOException propagated for interface compatibility
     * @throws InterruptedException if the join on the worker thread is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        long begin = System.currentTimeMillis();

        WebSpider spider1 = new WebSpider();
        spider1.setWebAddress("http://www.w3c.org/robots.txt");
        // The file extension is appended by WebSpider from the Content-Type header.
        spider1.setDestFile(Files.getSysPath() + "/" + "robots.");

        Thread t1 = new Thread(spider1);
        t1.start();
        // Wait for the download to finish before reporting the elapsed time.
        t1.join();

        System.out.println("the end");
        System.out.println(System.currentTimeMillis() - begin);
    }
}
 
package com.core.crawl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;

import com.core.http.Http;

/**
 * Runnable that downloads one web resource via HTTP GET, echoes each line
 * to stdout, and saves the content to a local file whose extension is
 * derived from the response's Content-Type header.
 */
public class WebSpider implements Runnable {

    private final Http http = new Http();
    private String webAddress = "";
    private String destFile = "";

    /** Sets the URL to download. */
    public void setWebAddress(String webAddress) {
        this.webAddress = webAddress;
    }

    /** Sets the destination file prefix; the extension is appended at download time. */
    public void setDestFile(String destFile) {
        this.destFile = destFile;
    }

    /**
     * Performs the download: GETs {@code webAddress}, prints each response
     * line, and writes the content to {@code destFile + extension}.
     *
     * @return true on success, false if the download failed
     * @throws IOException declared for interface compatibility
     * @throws InterruptedException declared for interface compatibility
     */
    public boolean download() throws IOException, InterruptedException {
        HttpURLConnection httpConn = null;
        BufferedReader reader = null;
        PrintWriter writer = null;
        try {
            URL url = new URL(webAddress);
            httpConn = (HttpURLConnection) url.openConnection();
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14");

            InputStream in = httpConn.getInputStream();
            String fileType = http.fileType(httpConn.getContentType());
            System.out.println(fileType);

            // BUG FIX: the output file was previously opened but never written
            // (the write loop was commented out), leaving an empty file and a
            // leaked stream. The content is now actually saved.
            writer = new PrintWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(destFile + fileType))));
            reader = new BufferedReader(new InputStreamReader(in));
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
                writer.println(line);
            }
            return true;
        } catch (Exception ex) {
            // Best-effort crawler: report the failure instead of propagating it,
            // but signal it through the return value.
            System.out.println(ex.toString());
            return false;
        } finally {
            if (writer != null) {
                writer.close();
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful to do if close fails after a completed read.
                }
            }
            // BUG FIX: guard against NPE when openConnection() itself failed.
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
    }

    /** Thread entry point: prints the worker thread's name and runs the download. */
    public void run() {
        try {
            System.out.println(Thread.currentThread().getName());
            download();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // BUG FIX: restore the interrupt flag so callers can observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}

 

Package COM. util. file; public class files {/***** get the root directory of the application * @ return application root directory */public static string getsyspath () {return system. getproperty ("user. dir ");}}
 
results:

Thread-0
html

 

# Robots.txt for http://www.w3.org/
#
# $Id: robots.txt,v 1.50 17:09:37 ted Exp $
#

 

# For use by search.w3.org
User-Agent: W3C-gsa
Disallow:/out-of-date

 

User-Agent: w3t_se
Disallow:/out-of-date

 

User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS search 4.0 robot)
Disallow :/

 

# W3C link checker
User-Agent: W3C-checklink
Disallow:

 

# Exclude some access-controlled areas
User-Agent :*
Disallow:/2004/ontaria/basic
Disallow:/team
Disallow:/Project
Disallow:/Web
Disallow:/Systems
Disallow:/History
Disallow:/out-of-date
Disallow:/2002/02/Mid
Disallow:/Mid/
Disallow:/2004/08/w3ctalks
Disallow:/2007/11/talks/search
Disallow:/people/All/
Disallow:/RDF/validator/arpservlet
Disallow:/2003/03/translations/bylanguage
Disallow:/2003/03/translations/bytechnology
Disallow:/2005/11/translations/Query
Disallow:/2003/glossary/subglossary/
# Disallow:/2005/06/blog/
# Disallow:/2001/07/pubrules-checker
# Shouldnt get transparent proxies but will ml links of things like pubrules
Disallow:/2000/06/webdata/XSLT
Disallow:/2000/09/webdata/XSLT
Disallow:/2005/08/online_xslt/XSLT
Disallow:/bugs/
Disallow:/search/mail/public/
Disallow:/2006/02/chartergen
the end
10485

 

spider1.setWebAddress("http://www.w3c.org/");
spider1.setDestFile(Files.getSysPath() + "/" + "w3c.");
Try these settings yourself.

Related Article

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.