I had been looking for a small Java project to write for fun, but couldn't find a suitable one, so I started learning a bit of web crawling instead, which I find quite interesting. Following a tutorial I found, this post implements crawling in two ways: over a raw Socket, and over HTTP with HttpURLConnection.
Project structure diagram:
(1) The SystemControl class drives the whole crawl: it schedules and runs each crawl task.
package com.simple.control;

import com.simple.Level.TaskLevel;
import com.simple.manger.CrawlerManger;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

import java.util.ArrayList;

/**
 * Drives the whole crawl: builds the task list and runs each task.
 * Created by lewis on 2016/10/15.
 */
public class SystemControl {
    public static void main(String[] args) {
        ArrayList<UrlPojo> urlPojoArrayList = new ArrayList<>();
        urlPojoArrayList.add(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH));
        urlPojoArrayList.add(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH));
        int count = 0;
        for (UrlPojo urlPojo : urlPojoArrayList) {
            CrawlerManger crawlerManger = new CrawlerManger(false);
            CrawlResultPojo crawlResultPojo = crawlerManger.crawl(urlPojo);
            System.out.println(crawlResultPojo.getPageContent());
            count++;
            System.out.println("Crawled " + count + " page(s) so far");
        }
    }
}
(2) The ICrawl interface gives the two crawl strategies a single contract; both implementations implement it.
package com.simple.Icrawl;

import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

/**
 * Common contract implemented by both crawl strategies.
 * Created by lewis on 2016/10/15.
 */
public interface ICrawl {
    CrawlResultPojo crawl(UrlPojo urlpojo);
}
(3) Each task is assigned a priority level.
package com.simple.Level;

/**
 * Priority level of a crawl task.
 * Created by lewis on 2016/10/15.
 */
public enum TaskLevel {
    HIGH, MIDDLE, LOW
}
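The demo never actually consults TaskLevel when scheduling. As an illustration of how the levels could drive ordering, here is a minimal sketch using a PriorityQueue; the PriorityScheduleSketch class and its comparator are my own addition, not part of the tutorial:

import java.util.Comparator;
import java.util.PriorityQueue;

import com.simple.Level.TaskLevel;
import com.simple.pojos.UrlPojo;

// Hypothetical scheduler sketch (not in the tutorial): run HIGH tasks first.
public class PriorityScheduleSketch {
    public static void main(String[] args) {
        // ordinal() follows declaration order: HIGH=0, MIDDLE=1, LOW=2,
        // so a min-heap on ordinal() pops HIGH before LOW.
        PriorityQueue<UrlPojo> queue = new PriorityQueue<>(
                Comparator.comparingInt((UrlPojo p) -> p.getTasklevel().ordinal()));
        queue.add(new UrlPojo("https://www.taobao.com/", TaskLevel.LOW));
        queue.add(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH));
        while (!queue.isEmpty()) {
            System.out.println(queue.poll().getTasklevel()); // prints HIGH, then LOW
        }
    }
}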
(4) The crawler's task class and result class
1) The task class: everything a crawl needs, including the target URL and the task's priority.
package com.simple.pojos;

import com.simple.Level.TaskLevel;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * @author lewis
 * A single URL crawl task.
 * Created by lewis on 2016/10/15.
 */
public class UrlPojo {
    private String url;                              // page URL
    private TaskLevel tasklevel = TaskLevel.MIDDLE;  // priority of this URL

    public UrlPojo(String url) {
        this.url = url;
    }

    public UrlPojo(String url, TaskLevel tasklevel) {
        this(url);
        this.tasklevel = tasklevel;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public TaskLevel getTasklevel() {
        return tasklevel;
    }

    public void setTasklevel(TaskLevel tasklevel) {
        this.tasklevel = tasklevel;
    }

    public String getHost() {  // extract the host name from the URL
        try {
            URL parsedUrl = new URL(this.url);
            return parsedUrl.getHost();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return null;  // malformed URL: no host available
    }

    public HttpURLConnection getConnection() {
        try {
            URL parsedUrl = new URL(this.url);
            URLConnection conn = parsedUrl.openConnection();
            if (conn instanceof HttpURLConnection) {
                return (HttpURLConnection) conn;
            } else {
                throw new Exception("failed to open an HTTP connection");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
2) The result class: everything a crawl produces is stored here.
package com.simple.pojos;

/**
 * Wraps the result of one crawl.
 * Created by lewis on 2016/10/15.
 */
public class CrawlResultPojo {
    private boolean isSuccess;    // whether the crawl succeeded
    private String pageContent;   // page content
    private int httpStatusCode;   // HTTP status code

    public boolean isSuccess() {
        return isSuccess;
    }

    public void setSuccess(boolean success) {
        isSuccess = success;
    }

    public String getPageContent() {
        return pageContent;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    public int getHttpStatusCode() {
        return httpStatusCode;
    }

    public void setHttpStatusCode(int httpStatusCode) {
        this.httpStatusCode = httpStatusCode;
    }
}
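Note that neither crawl implementation below ever fills in the status code field. A minimal sketch of how the HttpURLConnection path could populate it, reusing the getConnection() helper from UrlPojo above; the StatusCodeSketch helper is hypothetical, not part of the tutorial:

import java.io.IOException;
import java.net.HttpURLConnection;

import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

// Hypothetical helper (not in the tutorial): copy the HTTP status into the result.
public class StatusCodeSketch {
    static void recordStatus(UrlPojo urlpojo, CrawlResultPojo result) throws IOException {
        HttpURLConnection conn = urlpojo.getConnection();
        if (conn != null) {
            result.setHttpStatusCode(conn.getResponseCode()); // e.g. 200, 302, 404
        }
    }
}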
(5) The crawl manager: selects the crawl strategy and returns the crawl result.
package com.simple.manger;

import com.simple.Icrawl.ICrawl;
import com.simple.crawImpl.CrawlerImpl;
import com.simple.crawImpl.HttpUrlConnectionCrawlerImpl;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

/**
 * @author lewis
 * Crawl manager holding the business logic: picks the crawl strategy.
 * Created by lewis on 2016/10/15.
 */
public class CrawlerManger {
    private ICrawl crawler;

    public CrawlerManger(boolean isSocket) {
        if (isSocket) {
            this.crawler = new CrawlerImpl();
        } else {
            this.crawler = new HttpUrlConnectionCrawlerImpl();
        }
    }

    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        return this.crawler.crawl(urlPojo);
    }
}
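Usage is a single boolean toggle. A small driver sketch; the class name is my own, and it just mirrors what SystemControl does above:

import com.simple.manger.CrawlerManger;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

// Hypothetical driver (not in the tutorial): the boolean picks the strategy.
public class ManagerToggleSketch {
    public static void main(String[] args) {
        CrawlerManger socketCrawler = new CrawlerManger(true);  // Socket strategy
        CrawlerManger httpCrawler = new CrawlerManger(false);   // HttpURLConnection strategy
        CrawlResultPojo result = httpCrawler.crawl(new UrlPojo("https://www.taobao.com/"));
        System.out.println(result != null && result.isSuccess());
    }
}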
(6) The two crawl strategies:
1) The Socket approach:
package com.simple.crawImpl;

import com.simple.Icrawl.ICrawl;
import com.simple.Level.TaskLevel;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

import java.io.*;
import java.net.Socket;

/**
 * Socket-based crawl strategy.
 * Created by lewis on 2016/10/15.
 */
public class CrawlerImpl implements ICrawl {
    @Override
    public CrawlResultPojo crawl(UrlPojo urlpojo) {  // fetch the URL and wrap the result
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlpojo == null || urlpojo.getUrl() == null) {  // guard against a null task or URL
            crawlResultPojo.setPageContent(null);
            crawlResultPojo.setSuccess(false);
            return crawlResultPojo;
        }
        String host = urlpojo.getHost();
        BufferedWriter bw = null;
        BufferedReader br = null;
        Socket socket = null;
        if (host != null) {
            try {
                /*
                 * The usual steps of socket programming:
                 * (1) create the Socket;
                 * (2) open the input/output streams attached to the Socket;
                 * (3) read from / write to the Socket following some protocol;
                 * (4) close the Socket.
                 * In the Socket constructors, address/host/port identify the
                 * remote end of the connection (IP address, host name, port),
                 * while localPort/localAddr describe the local end.
                 */
                socket = new Socket(host, 80);
                bw = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
                /*
                 * HTTP/1.1 supports persistent connections: the server keeps
                 * the connection open after the response, so the readLine()
                 * loop below would block waiting for more data. With HTTP/1.0
                 * the server closes the connection once the response is sent,
                 * so the read loop terminates cleanly.
                 */
                bw.write("GET " + urlpojo.getUrl() + " HTTP/1.0\r\n"); // HTTP/1.1 would block here
                bw.write("Host: " + host + "\r\n");
                bw.write("\r\n"); // blank line after the headers: the request is complete
                bw.flush();       // flush the buffer so the request is actually sent
                br = new BufferedReader(new InputStreamReader(socket.getInputStream(), "utf-8"));
                String line;
                StringBuilder stringBuilder = new StringBuilder();
                while ((line = br.readLine()) != null) {
                    stringBuilder.append(line + "\n");
                }
                crawlResultPojo.setSuccess(true);
                crawlResultPojo.setPageContent(stringBuilder.toString());
                return crawlResultPojo;
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    if (br != null)      // null checks avoid NullPointerException
                        br.close();
                    if (bw != null)
                        bw.close();
                    if (socket != null)  // close the socket last to release the resource
                        socket.close();
                } catch (IOException e) {
                    e.printStackTrace();
                    System.out.println("failed to close the streams");
                }
            }
        }
        return null;
    }

    public static void main(String[] args) {
        CrawlerImpl cl = new CrawlerImpl();
        System.out.println(cl.crawl(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH)).getPageContent());
        System.out.println("done");
    }
}
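One caveat: the task URL is https, but this socket connects in plain text to port 80, so the server can at best answer with an error or a redirect to the https site. A minimal sketch of the TLS variant, assuming javax.net.ssl and port 443; this is my addition, not part of the tutorial:

import java.io.*;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;

// Hypothetical TLS variant (not in the tutorial): the same request flow over HTTPS.
public class HttpsSocketSketch {
    public static void main(String[] args) throws IOException {
        String host = "www.taobao.com";
        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory.getDefault();
        try (SSLSocket socket = (SSLSocket) factory.createSocket(host, 443);
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(socket.getOutputStream()));
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(socket.getInputStream(), "utf-8"))) {
            bw.write("GET / HTTP/1.0\r\n");          // origin-form path, not the absolute URL
            bw.write("Host: " + host + "\r\n\r\n");  // headers end with a blank line
            bw.flush();
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        } // try-with-resources closes reader, writer, then socket
    }
}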
2) The HTTP approach:
package com.simple.crawImpl;

import com.simple.Icrawl.ICrawl;
import com.simple.Level.TaskLevel;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;

/**
 * HttpURLConnection-based crawl strategy.
 * Created by lewis on 2016/10/15.
 */
public class HttpUrlConnectionCrawlerImpl implements ICrawl {
    @Override
    public CrawlResultPojo crawl(UrlPojo urlpojo) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlpojo == null || urlpojo.getUrl() == null) {  // guard against a null task or URL
            crawlResultPojo.setPageContent(null);
            crawlResultPojo.setSuccess(false);
            return crawlResultPojo;
        }
        HttpURLConnection httpURLConnection = urlpojo.getConnection();
        if (httpURLConnection != null) {
            BufferedReader bufferedReader = null;
            try {
                bufferedReader = new BufferedReader(
                        new InputStreamReader(httpURLConnection.getInputStream(), "utf-8"));
                String line;
                StringBuilder stringBuilder = new StringBuilder();
                while ((line = bufferedReader.readLine()) != null) {
                    stringBuilder.append(line + "\n");
                }
                crawlResultPojo.setPageContent(stringBuilder.toString());
                crawlResultPojo.setSuccess(true);
                return crawlResultPojo;
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    if (bufferedReader != null)
                        bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }

    public static void main(String[] args) {
        System.out.println(new HttpUrlConnectionCrawlerImpl()
                .crawl(new UrlPojo("https://www.taobao.com/", TaskLevel.HIGH)).getPageContent());
        System.out.println("done");
    }
}
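In practice you usually want timeouts and a User-Agent set on the connection before reading, otherwise a slow server can hang the whole crawl. A minimal hardening sketch using standard HttpURLConnection setters; the class name and the values are my own illustration, not from the tutorial:

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;

// Hypothetical hardening step (not in the tutorial): timeouts and a User-Agent.
public class ConnectionSetupSketch {
    public static void main(String[] args) throws IOException {
        HttpURLConnection conn =
                (HttpURLConnection) new URL("https://www.taobao.com/").openConnection();
        conn.setConnectTimeout(5000);  // give up after 5 s if the TCP connect stalls
        conn.setReadTimeout(10000);    // give up after 10 s of silence while reading
        conn.setRequestProperty("User-Agent", "SimpleCrawler/0.1");
        System.out.println("status: " + conn.getResponseCode());
        conn.disconnect();
    }
}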