How can I read the downloaded content:
package com.core.crawl;import java.io.IOException;import com.util.file.Files;public class Crawl { /** * @param args * @throws IOException * @throws InterruptedException */ public static void main(String[] args) throws IOException, InterruptedException {long begin = System.currentTimeMillis();//WebSpider spider2 = new WebSpider();WebSpider spider1 = new WebSpider();spider1.setWebAddress("http://www.w3c.org/robots.txt");spider1.setDestFile(Files.getSysPath() + "/"+"robots.");//spider2.setWebAddress("http://blog.csdn.net/longronglin");//spider2.setDestFile(Files.getSysPath() + "/"+"spider2.");Thread t1 = new Thread(spider1);//Thread t2 = new Thread(spider2);t1.start();//t2.start();t1.join();//t2.join();System.out.println("the end");System.out.println(System.currentTimeMillis() - begin); } }
package com.core.crawl;import java.io.BufferedReader;import java.io.DataInputStream;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.net.HttpURLConnection;import java.net.URL;import com.core.http.Http;public class WebSpider implements Runnable{ private Http http = new Http(); private String webAddress = ""; private String destFile = ""; public void setWebAddress(String webAddress){this.webAddress = webAddress; } public void setDestFile (String destFile){this.destFile = destFile; } public boolean download() throws IOException, InterruptedException {HttpURLConnection httpConn = null;try { URL url = new URL(webAddress); httpConn = (HttpURLConnection) url.openConnection(); httpConn.setRequestMethod("GET"); httpConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14"); InputStream in = httpConn.getInputStream(); String fileType = http.fileType(httpConn.getContentType()); System.out.println(fileType); FileOutputStream out = new FileOutputStream(new File(destFile + fileType)); int chByte = in.read(); BufferedReader bf = new BufferedReader(new InputStreamReader(in)); String result = null; while ((result = bf.readLine()) != null) {System.out.println(result); }// while (chByte != -1) {//out.write(chByte);////System.out.println(chByte);//chByte = in.read();// } } catch (Exception ex) { System.out.println(ex.toString());} finally { httpConn.disconnect();}return true; } public void run() {try { System.out.println(Thread.currentThread().getName()); download(); } catch (IOException e) { e.printStackTrace();} catch (InterruptedException e) { e.printStackTrace();} }}
Package COM. util. file; public class files {/***** get the root directory of the application * @ return application root directory */public static string getsyspath () {return system. getproperty ("user. dir ");}}
results:
Thread-0
Html
# Robots.txt for http://www.w3.org/
#
# $ ID: robots.txt, V 1.50 17:09:37 Ted exp $
#
# For use by search.w3.org
User-Agent: W3C-gsa
Disallow:/out-of-date
User-Agent: w3t_se
Disallow:/out-of-date
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS search 4.0 robot)
Disallow :/
# W3C link checker
User-Agent: W3C-checklink
Disallow:
# Exclude some access-controlled areas
User-Agent :*
Disallow:/2004/ontaria/basic
Disallow:/team
Disallow:/Project
Disallow:/Web
Disallow:/Systems
Disallow:/History
Disallow:/out-of-date
Disallow:/2002/02/Mid
Disallow:/Mid/
Disallow:/2004/08/w3ctalks
Disallow:/2007/11/talks/search
Disallow:/people/All/
Disallow:/RDF/validator/arpservlet
Disallow:/2003/03/translations/bylanguage
Disallow:/2003/03/translations/bytechnology
Disallow:/2005/11/translations/Query
Disallow:/2003/glossary/subglossary/
# Disallow:/2005/06/blog/
# Disallow:/2001/07/pubrules-checker
# Shouldnt get transparent proxies but will ml links of things like pubrules
Disallow:/2000/06/webdata/XSLT
Disallow:/2000/09/webdata/XSLT
Disallow:/2005/08/online_xslt/XSLT
Disallow:/bugs/
Disallow:/search/mail/public/
Disallow:/2006/02/chartergen
the end
10485
spider1.setWebAddress("http://www.w3c.org/");
spider1.setDestFile(Files.getSysPath() + "/" + "W3C.");
You can test with these settings yourself.