Implementation of a Java image-crawling program (improved version) __java

Source: Internet
Author: User
Tags bitset readline semaphore serialization stringbuffer

Main difficulties:

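1. Concurrent thread control, using the java.util.concurrent package introduced in JDK 5

2. Deduplication of already-seen page and image URLs (Bloom filter)

3. Serialization (persisting the deduplication filters to disk)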

How to run: java -Xms128m -Xmx512m -jar javacrawler.jar http://foxhq.com/ C:/a.log 0 d:/pic d:/url.tmp d:/img.tmp

Simplebloomfilter.java


// The original listing places this class in package com.hengking.crawl; restore
// "package com.hengking.crawl;" as the first statement when building it together
// with the crawler sources.

import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;

/**
 * A small Bloom filter over strings, used by the crawler to remember which page
 * and image URLs have already been seen.
 *
 * <p>Each added value sets one bit per seed in a fixed-size {@link BitSet}.
 * {@link #contains(String)} can therefore report false positives (a value that
 * was never added may appear present) but never false negatives.
 *
 * <p>The crawler shares one instance between its page-analysis threads and the
 * periodic serialization thread, so {@code add}, {@code contains} and the
 * serialization hook are synchronized on the filter instance.
 */
public class Simplebloomfilter implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Number of bits in the filter: {@code 2 << 24}, i.e. 2^25. It must stay a
     * power of two because {@link Simplehash#hash(String)} reduces the hash with
     * {@code (cap - 1) & result}.
     */
    private static final int DEFAULT_SIZE = 2 << 24;

    /** One multiplicative seed per hash function. */
    private static final int[] SEEDS = {7, 11, 13, 31, 37, 61};

    private final BitSet bits = new BitSet(DEFAULT_SIZE);
    private final Simplehash[] func = new Simplehash[SEEDS.length];

    /** Creates an empty filter with one hash function per seed. */
    public Simplebloomfilter() {
        for (int i = 0; i < SEEDS.length; i++) {
            func[i] = new Simplehash(DEFAULT_SIZE, SEEDS[i]);
        }
    }

    /**
     * Small demonstration: prints {@code false} before the value is added and
     * {@code true} afterwards.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String value = "stone2083@yahoo.cn";
        Simplebloomfilter filter = new Simplebloomfilter();
        System.out.println(filter.contains(value));
        filter.add(value);
        System.out.println(filter.contains(value));
    }

    /**
     * Records {@code value} in the filter.
     *
     * @param value the string to remember; must not be {@code null}
     * @throws NullPointerException if {@code value} is {@code null}
     */
    public synchronized void add(String value) {
        if (value == null) {
            throw new NullPointerException("value must not be null");
        }
        for (Simplehash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether {@code value} may have been added.
     *
     * @param value the string to look up; {@code null} is never contained
     * @return {@code false} if the value was definitely never added, {@code true}
     *         if it was added or collides with previously added values
     */
    public synchronized boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (Simplehash f : func) {
            // A single unset bit proves the value was never added.
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Serialization hook: holds the filter's lock while the default fields are
     * written so a concurrent {@link #add(String)} cannot modify the bit set
     * halfway through a snapshot.
     */
    private synchronized void writeObject(ObjectOutputStream out) throws IOException {
        out.defaultWriteObject();
    }

    /**
     * Seeded polynomial string hash reduced to a bit index in {@code [0, cap)}.
     *
     * <p>Declared {@code static} so that serializing it does not drag along a
     * hidden reference to an enclosing filter instance.
     */
    public static class Simplehash implements Serializable {

        private static final long serialVersionUID = 1L;

        private final int cap;
        private final int seed;

        /**
         * @param cap  size of the target bit set; must be a power of two
         * @param seed multiplier applied at every character
         */
        public Simplehash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Computes the bit index for {@code value}.
         *
         * @param value the string to hash; must not be {@code null}
         * @return an index in {@code [0, cap)}
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                // int overflow is intentional; the mask below keeps the index non-negative.
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }
    }
}

Utilseriz.java

// The original listing places this class in package com.hengking.crawl; restore
// "package com.hengking.crawl;" as the first statement when building it together
// with the crawler sources.

import java.io.*;

/**
 * Helpers for saving an object to a disk file with Java serialization and
 * loading it back.
 */
public class Utilseriz {

    /** Static helpers only; not meant to be instantiated. */
    private Utilseriz() {
    }

    /**
     * Serializes an object into a disk file, replacing any existing file.
     *
     * @param o       the object to write; must be serializable
     * @param strPath path of the target file
     * @throws Exception if the file cannot be opened or the object cannot be written
     */
    public static void writeObject(Object o, String strPath) throws Exception {
        File f = new File(strPath);
        // FileOutputStream truncates an existing file, so no separate delete is needed.
        OutputStream os = new BufferedOutputStream(new FileOutputStream(f));
        try {
            ObjectOutputStream oos = new ObjectOutputStream(os);
            oos.writeObject(o);
            oos.flush();
        } finally {
            // Closed even when writing fails, so the file handle is never leaked.
            os.close();
        }
    }

    /**
     * Deserializes the object stored in a disk file.
     *
     * <p>NOTE(review): this uses native Java deserialization, which must only be
     * applied to files this program wrote itself, never to untrusted input.
     *
     * @param strPath path of the file to read
     * @return the deserialized object, or {@code null} if the file does not exist
     * @throws Exception if the file cannot be read or does not hold a valid object
     */
    public static Object readObject(String strPath) throws Exception {
        File f = new File(strPath);
        if (!f.exists()) {
            return null;
        }
        InputStream is = new BufferedInputStream(new FileInputStream(f));
        try {
            return new ObjectInputStream(is).readObject();
        } finally {
            is.close();
        }
    }
}

Searchcrawler.java

Package com.hengking.crawl; Import Java.awt.image.BufferedImage; Import Java.io.BufferedInputStream; Import Java.io.BufferedReader; Import Java.io.BufferedWriter; Import Java.io.File; Import Java.io.FileOutputStream; Import Java.io.FileWriter; Import java.io.IOException; Import Java.io.InputStreamReader; Import Java.net.URL; Import Java.text.SimpleDateFormat; Import java.util.ArrayList; Import Java.util.Calendar; Import Java.util.Date; Import Java.util.HashMap; Import Java.util.LinkedHashSet; Import java.util.concurrent.Callable; Import Java.util.concurrent.ExecutorService; Import java.util.concurrent.Executors; Import Java.util.concurrent.Semaphore; Import Java.util.regex.Matcher; Import Java.util.regex.Pattern; Import Javax.imageio.ImageIO; Import Com.hengking.crawl.po.PoCalSearch; Import Com.hengking.crawl.po.PoDownload; /*** * Description: Drawing tools * @author you forever * * */public class Searchcrawler implements runnable{/* Disallowlistcache cache robot does not allow search URLs. The robot protocol sets a robots.txt file in the root directory of the Web site, * Specify theWhich pages are restricted to search. 
The search program should skip these areas during the search, following is an example of robots.txt: # robots.txt for http://somehost.com/User-agent: * Disallow:/cgi-bin/disallow :/registration #/disallow robots on registration page Disallow:/login/public static simplebloomfilter Filterurl; public static Simplebloomfilter filterimg; Private hashmap< string,arraylist< string>> disallowlistcache = new hashmap< string,arraylist< String >> (); arraylist< string> errorlist= New arraylist< string> ()//error message arraylist< string> result=new < string> (); The result of the search String starturl;//the beginning of the search linkedhashset<string> tocrawllist = new linkedhashset<string> (); Boolean casesensitive=false;//whether the case-sensitive Boolean limithost=false;//searches for a private static String outdir within a restricted host; private static String Seroutdir; private static String seroutdirimg; Private Boolean blnflag=false; private static Pocalsearch Ps=null; private static Podownload Pd=null; 300 picture Analysis thread private static Executorservice Execimg Final semaphore sempimg = new semaphore (300); 30 Web page Analysis thread private static Executorservice execpage; Final semaphore semppage = new semaphore (30); Private arraylist<parsepage> arrpar=new arraylist<parsepage> (); Record capture results private static bufferedwriter bw = NULL; Public Searchcrawler (String starturl) {This.starturl=starturl.} public arraylist< string> GetResult () {return res Ult The public void Run () {//Starts the search threads new Thread (new Timewrite2file ()). Start (); blnflag=true; crawl (Starturl,limithost, CaseSensitive); }//Detect URL Format private URL verifyurl (String URL) {//Only HTTP URLs are processed. if (!url.tolowercase (). StartsWith ("http://")) return null ; URL verifiedurl = null; try {verifiedurl = new URL (URL);} catch (Exception e) {return null;} return verifiedurl; //Detect if robot is allowed to access the given URL. 
Private Boolean isrobotallowed (URL urltocheck) {String host = Urltocheck.gethost (). toLowerCase ()//Get the host for the Rul// SYSTEM.OUT.PRINTLN ("host =" +host); Gets the URL cache that the host does not allow to search arraylist< string> disallowlist =disAllowlistcache.get (host); If it is not yet cached, download and cache it. if (disallowlist = = null) {disallowlist = new arraylist< string> (); try {url robotsfileurl =new url ("http://" + Ho St + "/robots.txt"); BufferedReader Reader =new BufferedReader (New InputStreamReader (Robotsfileurl.openstream ())); Read the robot file to create a list of paths that are not allowed to be accessed. String Line; while (line = Reader.readline ())!= null) {if (Line.indexof ("disallow:") = = 0) {//contains "disallow:" String Disallowpath =l Ine.substring ("Disallow:". Length ());//Get No access path//check for comments. int commentindex = Disallowpath.indexof ("#"); if (Commentindex!=-1) {Disallowpath =disallowpath.substring (0, Commentindex);//Remove comment} Disallowpath = Disallowpath.trim (); Disallowlist.add (Disallowpath); }//cache the path that this host is not allowed to access. Disallowlistcache.put (host, disallowlist); The catch (Exception e) {return true;//web there is no robots.txt file in the root directory of the site, returns true} String file = Urltocheck.getfile (); System.out.println ("file getfile () =" +file); for (int i = 0; i < disallowlist.size (); i++) {String disallow = DisallowliSt.get (i); if (File.startswith (disallow)) {return false;}} return true; Private String Downloadpage (URL pageurl) {try {//Open connection to URL for reading. BufferedReader reader = new BufferedReader (New InputStreamReader (Pageurl.openstream ())); Read page into buffer. 
String Line; StringBuffer pagebuffer = new StringBuffer (); while (line = Reader.readline ())!= null) {pagebuffer.append (line);} return pagebuffer.tostring (); catch (Exception e) {e.printstacktrace ();} return null; //Remove the "www" private string removewwwfromurl (string url) {int = Url.indexof (": www.") from the URL; if (index!=-1) {RET Urn url.substring (0, Index + 3) + url.substring (index + 7); return (URL); //Parse the page and find the link private arraylist< string> retrievelinks (URL pageurl, String pagecontents, Boolean limithost) {//with positive The expression compiles the matching pattern of the link. Pattern P =pattern.compile ("<a//s+href//s*=//s*/"? *?) [/"|&GT;]", pattern.case_insensitive); Matcher m = P.matcher (pagecontents); arraylist< string> Linklist = new arraylist< string> (); while (M.find ()) {String link = m.group (1). Trim (); if (Link.length () < 1) {continue;}//Skip link to this page. if (Link.charat (0) = = ' # ') {continue} if (Link.indexof ("mailto:")!=-1) {continue;} if (Link.tolowercase (). IndexOf ("JavaScript")!=-1) {continue;} if (Link.indexof ("://") = = 1) {if (Link.charat (0) = = '/ ' {//processing absolutely link = "http://" + pageurl.gethost () + ":" +pageurl.getport () + link;} else {String file = Pageurl.getfile (); if (File.indexof ('/') = = 1) {//handle relative address link = "http://" + pageurl.gethost () + ":" +pa Geurl.getport () + "/" + link; else {String path =file.substring (0, File.lastindexof ('/') + 1); link = "http://" + pageurl.gethost () + ":" +pageurl.getp ORT () + path + link; } int index = Link.indexof (' # '); if (index!=-1) {link = link.substring (0, index);} link = removewwwfromurl (link); URL verifiedlink = verifyurl (link); if (Verifiedlink = = null) {Continue}/* If the host is qualified, exclude those unqualified url*/if (limithost &&!pageurl.gethost (). ToLOwercase (). Equals (Verifiedlink.gethost (). toLowerCase ())) {Continue}//Skip those links that have been processed. 
if (filterurl.contains (link)) {LogEvent ("matched:" +link); Continue else {filterurl.add (link);} linklist.add (link); return (linklist); //Parse the page and find the link private arraylist< string> retrieveimglinks (URL pageurl, String pagecontents, Boolean limithost) {// Compiles a linked matching pattern with a regular expression. Pattern P =pattern.compile (" linklist = new arraylist< string> (); while (M.find ()) {String link = m.group (1). Trim (); if (Link.length () < 1) {continue;}//Skip link to this page. if (Link.charat (0) = = ' # ') {continue} if (Link.indexof ("mailto:")!=-1) {continue;} if (Link.tolowercase (). IndexOf ("JavaScript")!=-1) {continue;} if (Link.tolowercase (). EndsWith ("GIF")) {continue;} if (Link.indexof ("://") = = 1) {if (Link.charat (0) = = '/') {//processing absolutely link = "http://" + pageurl.gethost () + ":" +pageurL.getport () + link; else {String file = Pageurl.getfile (); if (File.indexof ('/') = = 1) {//handle relative address link = "http://" + pageurl.gethost () + ":" + Pageurl.getport () + "/" + link; else {String path =file.substring (0, File.lastindexof ('/') + 1); link = "http://" + pageurl.gethost () + ":" +pageurl.getp ORT () + path + link; } int index = Link.indexof (' # '); if (index!=-1) {link = link.substring (0, index);} link = removewwwfromurl (link); URL verifiedlink = verifyurl (link); if (Verifiedlink = = null) {Continue}/* If the host is qualified, exclude those unqualified url*/if (limithost &&!pageurl.gethost (). toLowerCase () . Equals (Verifiedlink.gethost (). toLowerCase ())) {Continue}//Skip those links that have been processed. if (crawledlist.contains (link)) {//continue;//} if (Filterimg.contains (link)) {logevent ("The picture matches:" +link); Continue else {filterimg.add (link);} if (Link.lastindexof (". 
gif") ==-1) {linklist.add (link);}} return (linklist); ///Perform the actual search operation public arraylist< string> Crawl (String Starturl,boolean Limithost,boolean CAsesensitive) {//remove www starturl = Removewwwfromurl (StartURL) from the start URL; tocrawllist.add (starturl); int idxpageparse=0; whi Le (Tocrawllist.size () >0) {try {idxpageparse++;//Get URL at bottom of the list. String URL = tocrawllist.iterator (). Next (); Ps.setinturl (Ps.getinturl () +1); Remove URL from the To crawl list. Tocrawllist.remove (URL); int intretrypage=0; while (Semppage.availablepermits () <=0) {System.out.println ("There is no idle web analytics thread, wait 3 seconds to execute ..."); try {intretrypage++; if (intretrypage==10) {logevent ("Parse Web page" +url+ "timeout"); Semppage.release (); Thread.Sleep (3000); catch (Interruptedexception e) {e.printstacktrace ();}} Parsepage temppagethread=new parsepage (URL); Execpage.submit (Temppagethread); LogEvent ("Open web Analytics Thread" +idxpageparse); if (idxpageparse==1) {Thread.CurrentThread (). Sleep (30000);}} catch (Exception e) {e.printstacktrace ();}} Blnflag=false; LogEvent ("Catch the Picture Complete ..."); return result; public static void LogEvent (String strlog) {System.out.println (new SimpledateformAt (mm minute SS sec in yyyy year mm month dd). Format (New Date (Calendar.getinstance (). Gettimeinmillis ()) + "=====>" +strlog); }//main function public static void main (string[] args) {if (args.length!=6) {System.out.println ("Usage:java Searchcrawler Startu RL MaxUrl searchstring "); Return } @SuppressWarnings ("unused") String strlogpath=args[1]; Searchcrawler crawler = new Searchcrawler (args[0]); outdir=args[3]+ "/pic" +new SimpleDateFormat ("YyyyMMdd"). Format (New Date (Calendar.getinstance (). 
Gettimeinmillis ()) )+"/"; File F=new file (OutDir); if (!f.exists ()) {F.mkdir ();} execpage = Executors.newfixedthreadpool (30); Execimg = Executors.newfixedthreadpool (300); SEROUTDIR=ARGS[4]; SEROUTDIRIMG=ARGS[5]; Ps=new Pocalsearch (); Pd=new podownload (); try {if (Utilseriz.readobject (seroutdir)!=null) {System.out.println (The new SimpleDateFormat ("MM minute ss seconds" for DD Day hh, yyyy). Format (New Date (Calendar.getinstance (). Gettimeinmillis ())) + "=====>" + "Deserialize URL ..."); Filterurl= (Simplebloomfilter) utilseriz.readobject (Seroutdir); else {FilterurL=new Simplebloomfilter (); } if (Utilseriz.readobject (seroutdir)!=null) {System.out.println (The new SimpleDateFormat ("mm cent SS seconds per month DD Day hh"). Format (New Date (Calendar.getinstance (). Gettimeinmillis ())) + "=====>" + "deserialize picture ..."); Filterimg= (Simplebloomfilter) utilseriz.readobject (seroutdirimg); else {filterimg=new simplebloomfilter ();}} catch (Exception e) {e.printstacktrace ();} String strpic=args[3]+ "/pic" +new SimpleDateFormat ("YyyyMMdd"). Format (New Date (Calendar.getinstance (). Gettimeinmillis ()) + ". Log"; try {bw=new BufferedWriter (new FileWriter (Strpic,false));} catch (IOException e) {//TODO auto-generated catch block E. Printstacktrace (); Thread search=new thread (crawler); System.out.println (The new SimpleDateFormat ("yyyy mm month DD Day hh when mm minute ss seconds"). Format (New Date (Calendar.getinstance (). 
Gettimeinmillis ()) + "=====>" + "start crawling ..."); System.out.println ("Download diagram:"); Search.start (); try {search.join (); LogEvent ("Main function End"); Bw.close ();} catch (Exception e) {//TODO auto-generated catch block e.printstAcktrace (); /** * Description: Download picture thread * @author binbin0915 * * */public class Imgdownthread implements runnable,callable<long>{//To be downloaded URL private String Stru; Private Boolean isstart=true; Public Imgdownthread (String strURL) {super (); This.stru = strURL} @Override public void Run () {try {sempimg.acquire () ; try{url url=new url (stru); Bufferedinputstream in = new Bufferedinputstream (Url.openstream ()); BufferedImage Bi=imageio.read (Url.openstream ()); Size requirements if (bi==null| | bi.getwidth () <30 | | bi.getheight () <30) {in.close (); return;} String ss=new SimpleDateFormat ("Yyyymmddhhmmss"). Format (New Date (Calendar.getinstance (). Gettimeinmillis ()) + "_" + Math.Round (Math.random () *89999999999999l+1000) +stru.substring (Stru.lastindexof (".")); String S=outdir+ss; FileOutputStream file = new FileOutputStream (new file (s)); int t; while ((t = In.read ())!=-1) {file.write (t);} file.close (); if (new file (s). Length () <=10*1024) {in.close (); new file (s). 
Delete (); return;} synchronized (BW) {sTring str=ss+ ":" +stru; Bw.write (str); Bw.newline (); Bw.flush (); } logevent ("Downloaded:" +stru); Ps.setintimg (Ps.getintimg () +1); In.close (); }catch (Exception e) {logevent ("********************** download Picture:" +stru+ "timeout");} catch (Exception e) {e.printstacktrace ();} finally{sempimg.release ();}} public Boolean Isstart () {return isstart.} public void Setstart (Boolean isstart) {this.isstart = Isstart;} @Override P Ublic Long Call () throws Exception {try {sempimg.acquire (); try{url url=new url (stru); Bufferedinputstream in = new Bufferedinputstream (Url.openstream ()); BufferedImage Bi=imageio.read (Url.openstream ()); Size requirements if (bi==null| | bi.getwidth () <30 | | bi.getheight () <30) {in.close (); return 0l;} String ss=new SimpleDateFormat ("Yyyymmddhhmmss"). Format (New Date (Calendar.getinstance (). Gettimeinmillis ()) + "_" + Math.Round (Math.random () *89999999999999l+1000) +stru.substring (Stru.lastindexof (".")); String S=outdir+ss; FileOutputStream file = new FileOutputStream (new file (s)); int t; While((t = In.read ())!=-1) {file.write (t);} File.close (); if (new file (s). Length () <=10*1024) {in.close (); new file (s). 
Delete (); return 0l;} logevent ("Downloaded:" +stru); Ps.setintimg (Ps.getintimg () +1); In.close (); }catch (Exception e) {logevent ("********************** download Picture:" +stru+ "timeout");} catch (Exception e) {e.printstacktrace ();} finally{sempimg.release (); return 1l;}} /*** * Serializes the visited URL * @author binbin0915 * * */public class Timewrite2file implements Runnable {@Override public void run ( {while (Blnflag) {try {synchronized (PS) {logevent ("Start serialization url"); Utilseriz.writeobject (Filterurl,seroutdir); LogEvent ("End of serialization url"); LogEvent ("Start serializing picture"); Utilseriz.writeobject (FILTERIMG,SEROUTDIRIMG); LogEvent ("End of serialized picture"); LogEvent ("analyzed" +ps.getinturl () + "a link"); LogEvent ("Downloaded" +ps.getintimg () + "picture"); } thread.sleep (600000); catch (Exception e) {e.printstacktrace ();}} /*** * Parse thread for URL web page * @author Administrator */class Parsepage extends thread {String url; int icount=0; public int Geticount() {return icount.} public void Seticount (int icount) {this.icount = icount;} public String GetUrl () {return URL;} p ublic void SetUrl (string url) {this.url = URL;} public parsepage (string url) {this.url=url;} @Override public void run () {try {semppage.acquire ();//Convert string URL to URL object. URL Verifiedurl = verifyurl (URL); Skip URL If robots are not allowed to access it. if (!isrobotallowed (Verifiedurl)) {Thread.CurrentThread (). 
Stop ();}//Add processed URLs to crawledlist String pagecontents= ""; pagecontents = Downloadpage (Verifiedurl); LogEvent ("analyzed:" +verifiedurl); LogEvent ("Number of URLs to be parsed:" +tocrawllist.size () + "a"); if (pagecontents!= null && pagecontents.length () > 0) {//Get a valid link from the page arraylist< string> links =retriev ELinks (Verifiedurl, pagecontents,limithost); Get a valid link from the page arraylist< string> imglinks =retrieveimglinks (Verifiedurl, pagecontents,limithost); Add to Picture Download queue if (Tocrawllist.size () <100000) {Tocrawllist.addall (links);} else {LogEvent ("to be dividedAnalysis of the Web page URL more than 100000 .... Skip over ... "); for (int i=0;i

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.