Java抓圖程式的實現(改進版)

來源:互聯網
上載者:User

主要痛點:

1. 並發線程的控制:採用 JDK 5 的 java.util.concurrent 套件(線程池與 Semaphore)

2.去重

3.序列化

運行方法:java -Xms128M -Xmx512M -jar JavaCrawler.jar http://foxhq.com/ C:/a.log 0 D:/pic D:/url.tmp D:/img.tmp(參數依序為:起始URL、日誌路徑、保留參數、圖片輸出目錄、URL去重序列化檔、圖片去重序列化檔)

SimpleBloomFilter.java


package com.hengking.crawl; import java.io.Serializable; import java.util.BitSet; public class SimpleBloomFilter implements Serializable { /** * */ private static final long serialVersionUID = 1L; private final int DEFAULT_SIZE = 2 << 24; private final int[] seeds = new int[] { 7, 11, 13, 31, 37, 61, }; private BitSet bits = new BitSet(DEFAULT_SIZE); private SimpleHash[] func = new SimpleHash[seeds.length]; // public void main(String[] args) { // String value = "stone2083@yahoo.cn"; // SimpleBloomFilter filter = new SimpleBloomFilter(); // System.out.println(filter.contains(value)); // filter.add(value); // System.out.println(filter.contains(value)); // } public SimpleBloomFilter() { for (int i = 0; i < seeds.length; i++) { func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]); } } public void add(String value) { for (SimpleHash f : func) { bits.set(f.hash(value), true); } } public boolean contains(String value) { if (value == null) { return false; } boolean ret = true; for (SimpleHash f : func) { ret = ret && bits.get(f.hash(value)); } return ret; } public class SimpleHash implements Serializable { private int cap; private int seed; public SimpleHash(int cap, int seed) { this.cap = cap; this.seed = seed; } public int hash(String value) { int result = 0; int len = value.length(); for (int i = 0; i < len; i++) { result = seed * result + value.charAt(i); } return (cap - 1) & result; } } @Override public String toString() { // TODO Auto-generated method stub return super.toString(); } }

 

UtilSeriz.java

package com.hengking.crawl; import java.io.*; public class UtilSeriz { /** *將對象序列化到磁碟檔案中 *@param *@throwsException */ public static void writeObject(Object o,String strPath) throws Exception{ File f=new File(strPath); if(f.exists()){ f.delete(); } FileOutputStream os=new FileOutputStream(f); //ObjectOutputStream 核心類 ObjectOutputStream oos=new ObjectOutputStream(os); oos.writeObject(o); oos.close(); os.close(); } /** *還原序列化,將磁碟檔案轉化為對象 *@paramf *@return *@throwsException */ public static Object readObject(String strPath) throws Exception{ File f=new File(strPath); if(!f.exists()) { return null; } InputStream is=new FileInputStream(f); //ObjectOutputStream 核心類 ObjectInputStream ois=new ObjectInputStream(is); return ois.readObject(); } }

SearchCrawler.java

package com.hengking.crawl; import java.awt.image.BufferedImage; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Semaphore; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.imageio.ImageIO; import com.hengking.crawl.po.PoCalSearch; import com.hengking.crawl.po.PoDownload; /*** * 說明:抓圖工具 * @author 君望永遠 * */ public class SearchCrawler implements Runnable{ /* disallowListCache緩衝robot不允許搜尋的URL。 Robot協議在Web網站的根目錄下設定一個robots.txt檔案, *規定網站上的哪些頁面是限制搜尋的。 搜尋程式應該在搜尋過程中跳過這些地區,下面是robots.txt的一個例子: # robots.txt for http://somehost.com/ User-agent: * Disallow: /cgi-bin/ Disallow: /registration # /Disallow robots on registration page Disallow: /login */ public static SimpleBloomFilter filterUrl; public static SimpleBloomFilter filterImg; private HashMap< String,ArrayList< String>> disallowListCache = new HashMap< String,ArrayList< String>>(); ArrayList< String> errorList= new ArrayList< String>();//錯誤資訊 ArrayList< String> result=new ArrayList< String>(); //搜尋到的結果 String startUrl;//開始搜尋的起點 LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>(); boolean caseSensitive=false;//是否區分大小寫 boolean limitHost=false;//是否在限制的主機內搜尋 private static String outdir; private static String seroutdir; private static String seroutdirimg; private boolean blnFlag=false; private static PoCalSearch ps=null; private static PoDownload pd=null; //300個圖片分析線程 private static ExecutorService execImg; final Semaphore sempImg = new Semaphore(300); 
//30個網頁分析線程 private static ExecutorService execPage; final Semaphore sempPage = new Semaphore(30); private ArrayList<ParsePage> arrPar=new ArrayList<ParsePage>(); //記錄抓圖結果 private static BufferedWriter bw = null; public SearchCrawler(String startUrl) { this.startUrl=startUrl; } public ArrayList< String> getResult(){ return result; } public void run(){//啟動搜尋線程 new Thread(new TimeWrite2File()).start(); blnFlag=true; crawl(startUrl,limitHost,caseSensitive); } //檢測URL格式 private URL verifyUrl(String url) { // 只處理HTTP URLs. if (!url.toLowerCase().startsWith("http://")) return null; URL verifiedUrl = null; try { verifiedUrl = new URL(url); } catch (Exception e) { return null; } return verifiedUrl; } // 檢測robot是否允許訪問給出的URL. private boolean isRobotAllowed(URL urlToCheck) { String host = urlToCheck.getHost().toLowerCase();//擷取給出RUL的主機 //System.out.println("主機="+host); // 擷取主機不允許搜尋的URL緩衝 ArrayList< String> disallowList =disallowListCache.get(host); // 如果還沒有緩衝,下載並緩衝。 if (disallowList == null) { disallowList = new ArrayList< String>(); try { URL robotsFileUrl =new URL("http://" + host + "/robots.txt"); BufferedReader reader =new BufferedReader(new InputStreamReader(robotsFileUrl.openStream())); // 讀robot檔案,建立不允許訪問的路徑列表。 String line; while ((line = reader.readLine()) != null) { if (line.indexOf("Disallow:") == 0) {//是否包含"Disallow:" String disallowPath =line.substring("Disallow:".length());//擷取不允許訪問路徑 // 檢查是否有注釋。 int commentIndex = disallowPath.indexOf("#"); if (commentIndex != - 1) { disallowPath =disallowPath.substring(0, commentIndex);//去掉注釋 } disallowPath = disallowPath.trim(); disallowList.add(disallowPath); } } // 緩衝此主機不允許訪問的路徑。 disallowListCache.put(host, disallowList); } catch (Exception e) { return true; //web網站根目錄下沒有robots.txt檔案,返回真 } } String file = urlToCheck.getFile(); //System.out.println("檔案getFile()="+file); for (int i = 0; i < disallowList.size(); i++) { String disallow = disallowList.get(i); if (file.startsWith(disallow)) { return false; } } return true; } 
private String downloadPage(URL pageUrl) { try { // Open connection to URL for reading. BufferedReader reader = new BufferedReader(new InputStreamReader(pageUrl.openStream())); // Read page into buffer. String line; StringBuffer pageBuffer = new StringBuffer(); while ((line = reader.readLine()) != null) { pageBuffer.append(line); } return pageBuffer.toString(); } catch (Exception e) { e.printStackTrace(); } return null; } // 從URL中去掉"www" private String removeWwwFromUrl(String url) { int index = url.indexOf("://www."); if (index != -1) { return url.substring(0, index + 3) + url.substring(index + 7); } return (url); } // 解析頁面並找出連結 private ArrayList< String> retrieveLinks(URL pageUrl, String pageContents, boolean limitHost) { // 用Regex編譯連結的匹配模式。 Pattern p =Pattern.compile("<a//s+href//s*=//s*/"?(.*?)[/"|>]",Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(pageContents); ArrayList< String> linkList = new ArrayList< String>(); while (m.find()) { String link = m.group(1).trim(); if (link.length() < 1) { continue; } // 跳過鏈到本頁面內連結。 if (link.charAt(0) == '#') { continue; } if (link.indexOf("mailto:") != -1) { continue; } if (link.toLowerCase().indexOf("javascript") != -1) { continue; } if (link.indexOf("://") == -1){ if (link.charAt(0) == '/') {//處理絕對地 link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort()+ link; } else { String file = pageUrl.getFile(); if (file.indexOf('/') == -1) {//處理相對位址 link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort() + "/" + link; } else { String path =file.substring(0, file.lastIndexOf('/') + 1); link = "http://" + pageUrl.getHost() +":"+pageUrl.getPort()+ path + link; } } } int index = link.indexOf('#'); if (index != -1) { link = link.substring(0, index); } link = removeWwwFromUrl(link); URL verifiedLink = verifyUrl(link); if (verifiedLink == null) { continue; } /* 如果限定主機,排除那些不合條件的URL*/ if (limitHost && !pageUrl.getHost().toLowerCase().equals( verifiedLink.getHost().toLowerCase())) { continue; } // 跳過那些已經處理的連結. 
if(filterUrl.contains(link)) { logEvent("匹配了:"+link); continue; } else { filterUrl.add(link); } linkList.add(link); } return (linkList); } // 解析頁面並找出連結 private ArrayList< String> retrieveImgLinks(URL pageUrl, String pageContents, boolean limitHost) { // 用Regex編譯連結的匹配模式。 Pattern p =Pattern.compile("<img//s+src//s*=//s*/"?(.*?)[/"|>]",Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(pageContents); ArrayList< String> linkList = new ArrayList< String>(); while (m.find()) { String link = m.group(1).trim(); if (link.length() < 1) { continue; } // 跳過鏈到本頁面內連結。 if (link.charAt(0) == '#') { continue; } if (link.indexOf("mailto:") != -1) { continue; } if (link.toLowerCase().indexOf("javascript") != -1) { continue; } if (link.toLowerCase().endsWith("gif")) { continue; } if (link.indexOf("://") == -1) { if (link.charAt(0) == '/') {//處理絕對地 link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort()+ link; } else { String file = pageUrl.getFile(); if (file.indexOf('/') == -1) {//處理相對位址 link = "http://" + pageUrl.getHost()+":"+pageUrl.getPort() + "/" + link; } else { String path =file.substring(0, file.lastIndexOf('/') + 1); link = "http://" + pageUrl.getHost() +":"+pageUrl.getPort()+ path + link; } } } int index = link.indexOf('#'); if (index != -1) { link = link.substring(0, index); } link = removeWwwFromUrl(link); URL verifiedLink = verifyUrl(link); if (verifiedLink == null) { continue; } /* 如果限定主機,排除那些不合條件的URL*/ if (limitHost && !pageUrl.getHost().toLowerCase().equals( verifiedLink.getHost().toLowerCase())) { continue; } // 跳過那些已經處理的連結. 
// if (crawledList.contains(link)) { // continue; // } if(filterImg.contains(link)) { logEvent("圖片匹配了:"+link); continue; } else { filterImg.add(link); } if(link.lastIndexOf(".gif")==-1) { linkList.add(link); } } return (linkList); } //執行實際的搜尋操作 public ArrayList< String> crawl(String startUrl,boolean limithost,boolean caseSensitive ) { // 從開始URL中移出www startUrl = removeWwwFromUrl(startUrl); toCrawlList.add(startUrl); int idxPageParse=0; while (toCrawlList.size()>0) { try { idxPageParse++; // Get URL at bottom of the list. String url = toCrawlList.iterator().next(); ps.setIntUrl(ps.getIntUrl()+1); // Remove URL from the to crawl list. toCrawlList.remove(url); int intRetryPage=0; while (sempPage.availablePermits()<=0) { System.out.println("暫時沒有閒置網頁分析線程,等待3秒再執行..."); try { intRetryPage++; if(intRetryPage==10) { logEvent("分析網頁"+url+"逾時"); sempPage.release(); break; } Thread.sleep(3000); } catch (InterruptedException e) { e.printStackTrace(); } } ParsePage tempPageThread=new ParsePage(url); execPage.submit(tempPageThread); logEvent("開啟網頁分析線程"+idxPageParse); if(idxPageParse==1) { Thread.currentThread().sleep(30000); } }catch(Exception e) { e.printStackTrace(); } } blnFlag=false; logEvent("抓圖完成......"); return result; } public static void logEvent(String strLog) { System.out.println( new SimpleDateFormat("yyyy年MM月dd日HH時mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+strLog); } // 主函數 public static void main(String[] args) { if(args.length!=6) { System.out.println("Usage:java SearchCrawler startUrl maxUrl searchString"); return; } @SuppressWarnings("unused") String strLogPath=args[1]; SearchCrawler crawler = new SearchCrawler(args[0]); outdir=args[3]+"/pic"+new SimpleDateFormat("yyyyMMdd").format(new Date(Calendar.getInstance().getTimeInMillis()))+"/"; File f=new File(outdir); if(!f.exists()) { f.mkdir(); } execPage = Executors.newFixedThreadPool(30); execImg = Executors.newFixedThreadPool(300); seroutdir=args[4]; seroutdirimg=args[5]; ps=new 
PoCalSearch(); pd=new PoDownload(); try { if(UtilSeriz.readObject(seroutdir)!=null) { System.out.println(new SimpleDateFormat("yyyy年MM月dd日HH時mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+"還原序列化URL..."); filterUrl=(SimpleBloomFilter)UtilSeriz.readObject(seroutdir); } else { filterUrl=new SimpleBloomFilter(); } if(UtilSeriz.readObject(seroutdir)!=null) { System.out.println(new SimpleDateFormat("yyyy年MM月dd日HH時mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+"還原序列化圖片..."); filterImg=(SimpleBloomFilter)UtilSeriz.readObject(seroutdirimg); } else { filterImg=new SimpleBloomFilter(); } } catch (Exception e) { e.printStackTrace(); } String strPic=args[3]+"/pic"+new SimpleDateFormat("yyyyMMdd").format(new Date(Calendar.getInstance().getTimeInMillis()))+".log"; try { bw=new BufferedWriter(new FileWriter(strPic,false)); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } Thread search=new Thread(crawler); System.out.println( new SimpleDateFormat("yyyy年MM月dd日HH時mm分ss秒").format(new Date(Calendar.getInstance().getTimeInMillis()))+"=====>"+"開始爬圖..."); System.out.println("下載了圖:"); search.start(); try { search.join(); logEvent("主函數結束"); bw.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * 說明:下載圖片的線程 * @author binbin0915 * */ public class ImgDownThread implements Runnable,Callable<Long>{ //待下載的URL private String stru; private boolean isStart=true; public ImgDownThread(String strurl) { super(); this.stru = strurl; } @Override public void run() { try { sempImg.acquire(); try{ URL url=new URL(stru); BufferedInputStream in = new BufferedInputStream(url.openStream()); BufferedImage bi=ImageIO.read(url.openStream()); //尺寸要求 if (bi==null|| bi.getWidth()<30 || bi.getHeight()<30 ) { in.close(); return; } String ss=new SimpleDateFormat("yyyyMMddHHmmss").format(new 
Date(Calendar.getInstance().getTimeInMillis()))+"_"+Math.round(Math.random()*89999999999999L+1000)+stru.substring(stru.lastIndexOf(".")); String s=outdir+ss; FileOutputStream file = new FileOutputStream(new File(s)); int t; while ((t = in.read()) != -1) { file.write(t); } file.close(); if(new File(s).length()<=10*1024) { in.close(); new File(s).delete(); return; } synchronized(bw) { String str=ss+":"+stru; bw.write(str); bw.newLine(); bw.flush(); } logEvent("下載了:"+stru); ps.setIntImg(ps.getIntImg()+1); in.close(); }catch(Exception e){ logEvent("**********************下載圖片:"+stru+"逾時"); } } catch (Exception e) { e.printStackTrace(); } finally{ sempImg.release(); } } public boolean isStart() { return isStart; } public void setStart(boolean isStart) { this.isStart = isStart; } @Override public Long call() throws Exception { try { sempImg.acquire(); try{ URL url=new URL(stru); BufferedInputStream in = new BufferedInputStream(url.openStream()); BufferedImage bi=ImageIO.read(url.openStream()); //尺寸要求 if (bi==null|| bi.getWidth()<30 || bi.getHeight()<30 ) { in.close(); return 0l; } String ss=new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(Calendar.getInstance().getTimeInMillis()))+"_"+Math.round(Math.random()*89999999999999L+1000)+stru.substring(stru.lastIndexOf(".")); String s=outdir+ss; FileOutputStream file = new FileOutputStream(new File(s)); int t; while ((t = in.read()) != -1) { file.write(t); } file.close(); if(new File(s).length()<=10*1024) { in.close(); new File(s).delete(); return 0l; } logEvent("下載了:"+stru); ps.setIntImg(ps.getIntImg()+1); in.close(); }catch(Exception e){ logEvent("**********************下載圖片:"+stru+"逾時"); } } catch (Exception e) { e.printStackTrace(); } finally{ sempImg.release(); return 1l; } } } /*** * 序列化已訪問的URL * @author binbin0915 * */ public class TimeWrite2File implements Runnable { @Override public void run() { while(blnFlag) { try { synchronized(ps) { logEvent("開始序列化URL"); UtilSeriz.writeObject(filterUrl,seroutdir); 
logEvent("結束序列化URL"); logEvent("開始序列化圖片"); UtilSeriz.writeObject(filterImg,seroutdirimg); logEvent("結束序列化圖片"); logEvent("分析了"+ps.getIntUrl()+"個連結"); logEvent("下載了"+ps.getIntImg()+"張圖片"); } Thread.sleep(600000); } catch (Exception e) { e.printStackTrace(); } } } } /*** * 分析對應URL網頁的線程 * @author Administrator * */ class ParsePage extends Thread { String url; int iCount=0; public int getiCount() { return iCount; } public void setiCount(int iCount) { this.iCount = iCount; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public ParsePage(String url) { this.url=url; } @Override public void run() { try { sempPage.acquire(); // Convert string url to URL object. URL verifiedUrl = verifyUrl(url); // Skip URL if robots are not allowed to access it. if (!isRobotAllowed(verifiedUrl)) { Thread.currentThread().stop(); } // 增加已處理的URL到crawledList String pageContents=""; pageContents = downloadPage(verifiedUrl); logEvent("分析了:"+verifiedUrl); logEvent("待分析URL數:"+toCrawlList.size()+"個"); if (pageContents != null && pageContents.length() > 0) { // 從頁面中擷取有效連結 ArrayList< String> links =retrieveLinks(verifiedUrl, pageContents,limitHost); // 從頁面中擷取有效連結 ArrayList< String> imglinks =retrieveImgLinks(verifiedUrl, pageContents,limitHost); //添加到圖片下載隊列 if(toCrawlList.size()<100000) { toCrawlList.addAll(links); } else { logEvent("待分析的網頁URL超過100000。。。。跳過......."); } for(int i=0;i<imglinks.size();i++) { if(imglinks.get(i).indexOf("http:")!=-1) { iCount++; filterImg.add(imglinks.get(i)); ps.setIntImg(ps.getIntImg()+1); int intRetryImg=0; while (sempImg.availablePermits() <= 0) { System.out.println("暫時沒有閒置抓圖線程,等待3秒再執行..."); try { intRetryImg++; if(intRetryImg==10) { logEvent("抓圖"+imglinks.get(i)+"逾時"); sempImg.release(); } Thread.sleep(3000); } catch (InterruptedException e) { e.printStackTrace(); } } Thread tempImgThread=new Thread(new ImgDownThread(imglinks.get(i))); execImg.submit(tempImgThread); if((iCount!=1) && (iCount%10==1) ) { try { 
logEvent("圖多休息2秒......"); Thread.currentThread().sleep(2000); } catch (InterruptedException e) { e.printStackTrace(); } } } } } synchronized(arrPar) { arrPar.remove(this); } } catch(Exception e) { e.printStackTrace(); } finally { sempPage.release(); } } } }

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.