Main difficulties:
1. Controlling the concurrent threads, handled with the java.util.concurrent package introduced in JDK 5 (a minimal sketch of the pattern follows this list)
2. Deduplication of URLs and images (done with a Bloom filter)
3. Serialization of the crawl state, so that a run can be resumed
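Before the listings, a minimal self-contained sketch of the concurrency-control idea behind item 1: a fixed thread pool from java.util.concurrent whose in-flight tasks are capped by a Semaphore, similar to the page-analysis and image-download pools in SearchCrawler below. The class and task names here are illustrative only, not part of the crawler.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

// Sketch of the JDK 5 concurrency control used by the crawler: a fixed-size
// thread pool plus a Semaphore that caps the number of tasks in flight.
// Class and task names are illustrative only.
public class BoundedPoolSketch {
    public static void main(String[] args) throws InterruptedException {
        final Semaphore slots = new Semaphore(30);            // e.g. 30 page-analysis permits
        ExecutorService pool = Executors.newFixedThreadPool(30);

        for (int i = 0; i < 100; i++) {
            final int taskId = i;
            slots.acquire();                                   // wait until a slot is free
            pool.submit(new Runnable() {
                public void run() {
                    try {
                        System.out.println("parsing task " + taskId);   // stand-in for real work
                    } finally {
                        slots.release();                       // free the slot for the next task
                    }
                }
            });
        }
        pool.shutdown();
    }
}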
How to run: java -Xms128m -Xmx512m -jar javacrawler.jar http://foxhq.com/ C:/a.log 0 d:/pic d:/url.tmp d:/img.tmp
Judging from main() in SearchCrawler, the six arguments are: the start URL, a log file path, a reserved value (unused here, 0), the output directory for downloaded images (a dated pic subfolder and a dated .log listing are created under it), the file holding the serialized URL Bloom filter, and the file holding the serialized image Bloom filter.
SimpleBloomFilter.java
package com.hengking.crawl;

import java.io.Serializable;
import java.util.BitSet;

public class SimpleBloomFilter implements Serializable {

    private static final long serialVersionUID = 1L;
    private static final int DEFAULT_SIZE = 2 << 24;
    private static final int[] seeds = new int[] { 7, 11, 13, 31, 37, 61 };

    private BitSet bits = new BitSet(DEFAULT_SIZE);
    private SimpleHash[] func = new SimpleHash[seeds.length];

    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    // Set the bit produced by every hash function for this value.
    public void add(String value) {
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    // The value is (probably) present only if every hashed bit is set.
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        boolean ret = true;
        for (SimpleHash f : func) {
            ret = ret && bits.get(f.hash(value));
        }
        return ret;
    }

    // Simple self-test.
    public static void main(String[] args) {
        String value = "stone2083@yahoo.cn";
        SimpleBloomFilter filter = new SimpleBloomFilter();
        System.out.println(filter.contains(value));   // false
        filter.add(value);
        System.out.println(filter.contains(value));   // true
    }

    public static class SimpleHash implements Serializable {

        private static final long serialVersionUID = 1L;
        private int cap;
        private int seed;

        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        // Horner-style string hash, masked to the filter's capacity.
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }
    }
}
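Since the deduplication ("difficulty 2") rests entirely on this class, here is a small usage sketch mirroring the contains()/add() calls in SearchCrawler.retrieveLinks(). The class name BloomFilterUsageSketch and the URL are illustrative only, not part of the original project.

package com.hengking.crawl;

// Hypothetical usage sketch: deduplicating URLs with the Bloom filter,
// mirroring the contains()/add() calls in SearchCrawler.retrieveLinks().
public class BloomFilterUsageSketch {
    public static void main(String[] args) {
        SimpleBloomFilter filterUrl = new SimpleBloomFilter();
        String link = "http://example.com/page1.html";      // illustrative URL

        if (filterUrl.contains(link)) {
            System.out.println("already seen, skipping: " + link);
        } else {
            filterUrl.add(link);                             // mark as seen before queuing it
            System.out.println("queued: " + link);
        }
    }
}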
UtilSeriz.java
package com.hengking.crawl;

import java.io.*;

public class UtilSeriz {

    /**
     * Serializes an object to a disk file.
     * @param o object to write
     * @param strPath target file path
     * @throws Exception
     */
    public static void writeObject(Object o, String strPath) throws Exception {
        File f = new File(strPath);
        if (f.exists()) {
            f.delete();
        }
        FileOutputStream os = new FileOutputStream(f);
        // ObjectOutputStream is the core class here
        ObjectOutputStream oos = new ObjectOutputStream(os);
        oos.writeObject(o);
        oos.close();
        os.close();
    }

    /**
     * Deserialization: reads a disk file back into an Object.
     * @param strPath source file path
     * @return the deserialized object, or null if the file does not exist
     * @throws Exception
     */
    public static Object readObject(String strPath) throws Exception {
        File f = new File(strPath);
        if (!f.exists()) {
            return null;
        }
        InputStream is = new FileInputStream(f);
        // ObjectInputStream is the core class here
        ObjectInputStream ois = new ObjectInputStream(is);
        return ois.readObject();
    }
}
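A short round-trip sketch showing how these helpers persist the Bloom filter between runs, along the lines of what SearchCrawler.main() and the TimeWrite2File thread do. The class name SerializationSketch and the path are illustrative only.

package com.hengking.crawl;

// Hypothetical round-trip sketch: restore the URL filter from disk if a
// previous run left one, update it, and write it back, as SearchCrawler does.
public class SerializationSketch {
    public static void main(String[] args) throws Exception {
        String serOutDir = "d:/url.tmp";   // illustrative path

        // Restore the filter from a previous run, or start fresh.
        SimpleBloomFilter filterUrl = (SimpleBloomFilter) UtilSeriz.readObject(serOutDir);
        if (filterUrl == null) {
            filterUrl = new SimpleBloomFilter();
        }

        filterUrl.add("http://example.com/");

        // Periodically write the filter back so progress survives a restart.
        UtilSeriz.writeObject(filterUrl, serOutDir);
    }
}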
SearchCrawler.java
package com.hengking.crawl;

import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import com.hengking.crawl.po.PoCalSearch;
import com.hengking.crawl.po.PoDownload;

/***
 * Description: image-crawling tool
 */
public class SearchCrawler implements Runnable {

    /*
     * disallowListCache caches the URLs that robots are not allowed to visit.
     * The robots protocol places a robots.txt file in the root directory of a
     * web site to specify which pages are off limits; the crawler should skip
     * those areas. An example robots.txt:
     *
     *   # robots.txt for http://somehost.com/
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration   # disallow robots on the registration page
     *   Disallow: /login
     */
    public static SimpleBloomFilter filterUrl;
    public static SimpleBloomFilter filterImg;
    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>();   // error messages
    ArrayList<String> result = new ArrayList<String>();      // search results
    String startUrl;                                         // starting point of the crawl
    LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
    boolean caseSensitive = false;                           // whether matching is case-sensitive
    boolean limitHost = false;                               // whether to stay on the starting host
    private static String outDir;
    private static String serOutDir;
    private static String serOutDirImg;
    private boolean blnFlag = false;
    private static PoCalSearch ps = null;
    private static PoDownload pd = null;
    // up to 300 concurrent image-download tasks
    private static ExecutorService execImg;
    final Semaphore sempImg = new Semaphore(300);
    // up to 30 concurrent page-analysis tasks
    private static ExecutorService execPage;
    final Semaphore sempPage = new Semaphore(30);
    private ArrayList<ParsePage> arrPar = new ArrayList<ParsePage>();
    // records the crawl results
    private static BufferedWriter bw = null;

    public SearchCrawler(String startUrl) {
        this.startUrl = startUrl;
    }

    public ArrayList<String> getResult() {
        return result;
    }

    public void run() {
        // start the periodic serialization thread, then begin crawling
        new Thread(new TimeWrite2File()).start();
        blnFlag = true;
        crawl(startUrl, limitHost, caseSensitive);
    }

    // Check the URL format
    private URL verifyUrl(String url) {
        // Only HTTP URLs are processed.
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        return verifiedUrl;
    }

    // Check whether robots allow access to the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();   // host of the URL
        // Get the cached list of paths this host disallows.
        ArrayList<String> disallowList = disallowListCache.get(host);
        // If it is not cached yet, download and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) {                           // line contains "Disallow:"
                        String disallowPath = line.substring("Disallow:".length()); // the disallowed path
                        // Strip a trailing comment, if any.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }
                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }
                // Cache the paths this host disallows.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                return true;   // the site has no robots.txt, so assume access is allowed
            }
        }
        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }
        return true;
    }

    private String downloadPage(URL pageUrl) {
        try {
            // Open a connection to the URL for reading.
            BufferedReader reader = new BufferedReader(new InputStreamReader(pageUrl.openStream()));
            // Read the page into a buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // Remove "www" from the URL
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return url;
    }

    // Parse the page and collect the links it contains
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents, boolean limitHost) {
        // Compile the link-matching pattern.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]", Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);
        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.length() < 1) {
                continue;
            }
            // Skip anchors within this page.
            if (link.charAt(0) == '#') {
                continue;
            }
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }
            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') {   // absolute path on the same host
                    link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + link;
                } else {                       // relative address
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + "/" + link;
                    } else {
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + path + link;
                    }
                }
            }
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }
            link = removeWwwFromUrl(link);
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }
            /* If limited to one host, skip URLs on other hosts */
            if (limitHost && !pageUrl.getHost().toLowerCase().equals(verifiedLink.getHost().toLowerCase())) {
                continue;
            }
            // Skip links that have already been processed.
            if (filterUrl.contains(link)) {
                logEvent("URL already seen: " + link);
                continue;
            } else {
                filterUrl.add(link);
            }
            linkList.add(link);
        }
        return linkList;
    }

    // Parse the page and collect the image links it contains
    private ArrayList<String> retrieveImgLinks(URL pageUrl, String pageContents, boolean limitHost) {
        // NOTE: the original pattern literal was lost when this listing was published;
        // an <img ... src="..."> pattern is assumed here.
        Pattern p = Pattern.compile("<img\\s+[^>]*src\\s*=\\s*\"?(.*?)[\"|>]", Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);
        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.length() < 1) {
                continue;
            }
            // Skip anchors within this page.
            if (link.charAt(0) == '#') {
                continue;
            }
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }
            if (link.toLowerCase().endsWith("gif")) {
                continue;
            }
            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') {   // absolute path on the same host
                    link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + link;
                } else {                       // relative address
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + "/" + link;
                    } else {
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + path + link;
                    }
                }
            }
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }
            link = removeWwwFromUrl(link);
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }
            /* If limited to one host, skip URLs on other hosts */
            if (limitHost && !pageUrl.getHost().toLowerCase().equals(verifiedLink.getHost().toLowerCase())) {
                continue;
            }
            // Skip images that have already been processed.
            // if (crawledList.contains(link)) { continue; }   // superseded by the Bloom filter
            if (filterImg.contains(link)) {
                logEvent("Image already seen: " + link);
                continue;
            } else {
                filterImg.add(link);
            }
            if (link.lastIndexOf(".gif") == -1) {
                linkList.add(link);
            }
        }
        return linkList;
    }

    // Perform the actual crawl
    public ArrayList<String> crawl(String startUrl, boolean limitHost, boolean caseSensitive) {
        // Remove "www" from the start URL
        startUrl = removeWwwFromUrl(startUrl);
        toCrawlList.add(startUrl);
        int idxPageParse = 0;
        while (toCrawlList.size() > 0) {
            try {
                idxPageParse++;
                // Get the URL at the head of the list and remove it.
                String url = toCrawlList.iterator().next();
                ps.setIntUrl(ps.getIntUrl() + 1);
                toCrawlList.remove(url);
                int intRetryPage = 0;
                while (sempPage.availablePermits() <= 0) {
                    System.out.println("No idle page-analysis thread; waiting 3 seconds...");
                    try {
                        intRetryPage++;
                        if (intRetryPage == 10) {
                            logEvent("Parsing page " + url + " timed out");
                            sempPage.release();
                        }
                        Thread.sleep(3000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                ParsePage tempPageThread = new ParsePage(url);
                execPage.submit(tempPageThread);
                logEvent("Started page-analysis task " + idxPageParse);
                if (idxPageParse == 1) {
                    Thread.sleep(30000);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        blnFlag = false;
        logEvent("Image crawl finished...");
        return result;
    }

    public static void logEvent(String strLog) {
        System.out.println(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
                .format(new Date(Calendar.getInstance().getTimeInMillis())) + " =====> " + strLog);
    }

    // main entry point
    public static void main(String[] args) {
        if (args.length != 6) {
            System.out.println("Usage: java SearchCrawler startUrl logPath reserved outDir urlFilterFile imgFilterFile");
            return;
        }
        @SuppressWarnings("unused")
        String strLogPath = args[1];
        SearchCrawler crawler = new SearchCrawler(args[0]);
        outDir = args[3] + "/pic"
                + new SimpleDateFormat("yyyyMMdd").format(new Date(Calendar.getInstance().getTimeInMillis())) + "/";
        File f = new File(outDir);
        if (!f.exists()) {
            f.mkdir();
        }
        execPage = Executors.newFixedThreadPool(30);
        execImg = Executors.newFixedThreadPool(300);
        serOutDir = args[4];
        serOutDirImg = args[5];
        ps = new PoCalSearch();
        pd = new PoDownload();
        try {
            if (UtilSeriz.readObject(serOutDir) != null) {
                logEvent("Deserializing the URL filter...");
                filterUrl = (SimpleBloomFilter) UtilSeriz.readObject(serOutDir);
            } else {
                filterUrl = new SimpleBloomFilter();
            }
            if (UtilSeriz.readObject(serOutDirImg) != null) {   // the original checked serOutDir here; serOutDirImg is intended
                logEvent("Deserializing the image filter...");
                filterImg = (SimpleBloomFilter) UtilSeriz.readObject(serOutDirImg);
            } else {
                filterImg = new SimpleBloomFilter();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        String strPic = args[3] + "/pic"
                + new SimpleDateFormat("yyyyMMdd").format(new Date(Calendar.getInstance().getTimeInMillis())) + ".log";
        try {
            bw = new BufferedWriter(new FileWriter(strPic, false));
        } catch (IOException e) {
            e.printStackTrace();
        }
        Thread search = new Thread(crawler);
        logEvent("Starting crawl...");
        System.out.println("Downloaded images:");
        search.start();
        try {
            search.join();
            logEvent("Main function finished");
            bw.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Description: image-download task
     * @author binbin0915
     */
    public class ImgDownThread implements Runnable, Callable<Long> {
        // URL to download
        private String strU;
        private boolean isStart = true;

        public ImgDownThread(String strUrl) {
            super();
            this.strU = strUrl;
        }

        @Override
        public void run() {
            try {
                sempImg.acquire();
                try {
                    URL url = new URL(strU);
                    BufferedInputStream in = new BufferedInputStream(url.openStream());
                    BufferedImage bi = ImageIO.read(url.openStream());
                    // size requirement: skip images smaller than 30x30 pixels
                    if (bi == null || bi.getWidth() < 30 || bi.getHeight() < 30) {
                        in.close();
                        return;
                    }
                    String ss = new SimpleDateFormat("yyyyMMddHHmmss")
                            .format(new Date(Calendar.getInstance().getTimeInMillis()))
                            + "_" + Math.round(Math.random() * 89999999999999L + 1000)
                            + strU.substring(strU.lastIndexOf("."));
                    String s = outDir + ss;
                    FileOutputStream file = new FileOutputStream(new File(s));
                    int t;
                    while ((t = in.read()) != -1) {
                        file.write(t);
                    }
                    file.close();
                    // discard files of 10 KB or less
                    if (new File(s).length() <= 10 * 1024) {
                        in.close();
                        new File(s).delete();
                        return;
                    }
                    synchronized (bw) {
                        String str = ss + ":" + strU;
                        bw.write(str);
                        bw.newLine();
                        bw.flush();
                    }
                    logEvent("Downloaded: " + strU);
                    ps.setIntImg(ps.getIntImg() + 1);
                    in.close();
                } catch (Exception e) {
                    logEvent("********************** downloading image " + strU + " timed out");
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                sempImg.release();
            }
        }

        public boolean isStart() {
            return isStart;
        }

        public void setStart(boolean isStart) {
            this.isStart = isStart;
        }

        @Override
        public Long call() throws Exception {
            // same logic as run(), but reports 0L for a skipped image and 1L otherwise
            try {
                sempImg.acquire();
                try {
                    URL url = new URL(strU);
                    BufferedInputStream in = new BufferedInputStream(url.openStream());
                    BufferedImage bi = ImageIO.read(url.openStream());
                    // size requirement: skip images smaller than 30x30 pixels
                    if (bi == null || bi.getWidth() < 30 || bi.getHeight() < 30) {
                        in.close();
                        return 0L;
                    }
                    String ss = new SimpleDateFormat("yyyyMMddHHmmss")
                            .format(new Date(Calendar.getInstance().getTimeInMillis()))
                            + "_" + Math.round(Math.random() * 89999999999999L + 1000)
                            + strU.substring(strU.lastIndexOf("."));
                    String s = outDir + ss;
                    FileOutputStream file = new FileOutputStream(new File(s));
                    int t;
                    while ((t = in.read()) != -1) {
                        file.write(t);
                    }
                    file.close();
                    // discard files of 10 KB or less
                    if (new File(s).length() <= 10 * 1024) {
                        in.close();
                        new File(s).delete();
                        return 0L;
                    }
                    logEvent("Downloaded: " + strU);
                    ps.setIntImg(ps.getIntImg() + 1);
                    in.close();
                } catch (Exception e) {
                    logEvent("********************** downloading image " + strU + " timed out");
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                sempImg.release();
            }
            return 1L;
        }
    }

    /***
     * Periodically serializes the visited URLs and images.
     * @author binbin0915
     */
    public class TimeWrite2File implements Runnable {
        @Override
        public void run() {
            while (blnFlag) {
                try {
                    synchronized (ps) {
                        logEvent("Serializing the URL filter...");
                        UtilSeriz.writeObject(filterUrl, serOutDir);
                        logEvent("URL filter serialized");
                        logEvent("Serializing the image filter...");
                        UtilSeriz.writeObject(filterImg, serOutDirImg);
                        logEvent("Image filter serialized");
                        logEvent("Analyzed " + ps.getIntUrl() + " links");
                        logEvent("Downloaded " + ps.getIntImg() + " images");
                    }
                    Thread.sleep(600000);   // every 10 minutes
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /***
     * Task that parses one page URL.
     * @author Administrator
     */
    class ParsePage extends Thread {
        String url;
        int iCount = 0;

        public int getICount() {
            return iCount;
        }

        public void setICount(int iCount) {
            this.iCount = iCount;
        }

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public ParsePage(String url) {
            this.url = url;
        }

        @Override
        public void run() {
            try {
                sempPage.acquire();
                // Convert the string URL to a URL object.
                URL verifiedUrl = verifyUrl(url);
                // Skip the URL if robots do not allow access to it.
                if (!isRobotAllowed(verifiedUrl)) {
                    Thread.currentThread().stop();
                }
                // Download and analyze the page.
                String pageContents = "";
                pageContents = downloadPage(verifiedUrl);
                logEvent("Analyzed: " + verifiedUrl);
                logEvent("URLs still to be parsed: " + toCrawlList.size());
                if (pageContents != null && pageContents.length() > 0) {
                    // Collect the valid links on the page.
                    ArrayList<String> links = retrieveLinks(verifiedUrl, pageContents, limitHost);
                    // Collect the valid image links on the page.
                    ArrayList<String> imgLinks = retrieveImgLinks(verifiedUrl, pageContents, limitHost);
                    // Add the links to the crawl queue.
                    if (toCrawlList.size() < 100000) {
                        toCrawlList.addAll(links);
                    } else {
                        logEvent("More than 100000 page URLs are waiting to be analyzed... skipping...");
                    }
                    for (int i=0;i
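The listing ends mid-loop in the source. Given the execImg pool and the ImgDownThread class declared earlier, the remaining loop presumably hands each discovered image link to the download pool; the fragment below is a hypothetical sketch of that step, not the author's original code.

// Hypothetical continuation of ParsePage.run(): submit every image link found
// on the page to the 300-thread image-download pool declared above.
for (int i = 0; i < imgLinks.size(); i++) {
    // cast to Runnable because ImgDownThread implements both Runnable and Callable
    execImg.submit((Runnable) new ImgDownThread(imgLinks.get(i)));
}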