Java: Download Web Pages and Read Their Content

Last Update:2018-12-03 Source: Internet

Author: User

Tags xslt

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more ＞

How to download a web page and read its content:

package com.core.crawl;import java.io.IOException;import com.util.file.Files;public class Crawl {    /**     * @param args     * @throws IOException      * @throws InterruptedException      */    public static void main(String[] args) throws IOException, InterruptedException {long begin = System.currentTimeMillis();//WebSpider spider2 = new WebSpider();WebSpider spider1 = new WebSpider();spider1.setWebAddress("http://www.w3c.org/robots.txt");spider1.setDestFile(Files.getSysPath() + "/"+"robots.");//spider2.setWebAddress("http://blog.csdn.net/longronglin");//spider2.setDestFile(Files.getSysPath() + "/"+"spider2.");Thread t1 = new Thread(spider1);//Thread t2 = new Thread(spider2);t1.start();//t2.start();t1.join();//t2.join();System.out.println("the end");System.out.println(System.currentTimeMillis() - begin);    }    }

package com.core.crawl;import java.io.BufferedReader;import java.io.DataInputStream;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.net.HttpURLConnection;import java.net.URL;import com.core.http.Http;public class WebSpider implements Runnable{        private Http http = new Http();    private String webAddress = "";    private String destFile = "";        public void setWebAddress(String webAddress){this.webAddress = webAddress;    }        public void setDestFile (String destFile){this.destFile = destFile;    }        public boolean download() throws IOException, InterruptedException {HttpURLConnection httpConn = null;try {    URL url = new URL(webAddress);      httpConn = (HttpURLConnection) url.openConnection();    httpConn.setRequestMethod("GET");    httpConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14");    InputStream in = httpConn.getInputStream();    String fileType = http.fileType(httpConn.getContentType());    System.out.println(fileType);    FileOutputStream out = new FileOutputStream(new File(destFile + fileType));    int chByte = in.read();    BufferedReader bf = new BufferedReader(new InputStreamReader(in));    String result = null;    while ((result = bf.readLine()) != null) {System.out.println(result);    }//            while (chByte != -1) {//out.write(chByte);////System.out.println(chByte);//chByte = in.read();//    }            } catch (Exception ex) {    System.out.println(ex.toString());} finally {    httpConn.disconnect();}return true;    }    public void run() {try {    System.out.println(Thread.currentThread().getName());    download();    } catch (IOException e) {    e.printStackTrace();} catch (InterruptedException e) {    e.printStackTrace();}    }}

Package COM. util. file; public class files {/***** get the root directory of the application * @ return application root directory */public static string getsyspath () {return system. getproperty ("user. dir ");}}

Results:

Thread-0
Html

# Robots.txt for http://www.w3.org/
#
# $ ID: robots.txt, V 1.50 17:09:37 Ted exp $
#

# For use by search.w3.org
User-Agent: W3C-gsa
Disallow:/out-of-date

User-Agent: w3t_se
Disallow:/out-of-date

User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS search 4.0 robot)
Disallow :/

# W3C link checker
User-Agent: W3C-checklink
Disallow:

# Exclude some access-controlled areas
User-Agent :*
Disallow:/2004/ontaria/basic
Disallow:/team
Disallow:/Project
Disallow:/Web
Disallow:/Systems
Disallow:/History
Disallow:/out-of-date
Disallow:/2002/02/Mid
Disallow:/Mid/
Disallow:/2004/08/w3ctalks
Disallow:/2007/11/talks/search
Disallow:/people/All/
Disallow:/RDF/validator/arpservlet
Disallow:/2003/03/translations/bylanguage
Disallow:/2003/03/translations/bytechnology
Disallow:/2005/11/translations/Query
Disallow:/2003/glossary/subglossary/
# Disallow:/2005/06/blog/
# Disallow:/2001/07/pubrules-checker
# Shouldnt get transparent proxies but will ml links of things like pubrules
Disallow:/2000/06/webdata/XSLT
Disallow:/2000/09/webdata/XSLT
Disallow:/2005/08/online_xslt/XSLT
Disallow:/bugs/
Disallow:/search/mail/public/
Disallow:/2006/02/chartergen
the end
10485

spider1.setWebAddress("http://www.w3c.org/");
spider1.setDestFile(Files.getSysPath() + "/" + "W3C.");
You can try other settings like these yourself.

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More