Multi-threaded web crawler: a Python implementation (II)

The full source follows. It targets Python 2 (urllib.urlopen, raw_input, print statements). Compared with the single-threaded version, crawling2() hands each popped URL to a worker thread, and all shared state is updated under a global lock.

#!/usr/bin/env python
# coding=utf-8
import threading
import urllib
import re

# Shared bookkeeping, protected by t_mutex in the threaded path:
# cur        -- number of pages processed so far
# last       -- value of totalcount when the current depth level started
# totalcount -- total number of URLs enqueued so far
# depth      -- current crawl depth
cur = 0
last = 0
totalcount = 0
depth = 0
t_mutex = threading.Condition()


def getlinks(url):
    # Download a page and pull out every quoted http:// URL.
    # (In the original listing this was a method of Crawlerthread only,
    # which left the single-threaded crawling() path broken.)
    try:
        page = urllib.urlopen(url)
        html = page.read()
        regob = re.compile(r'"(http://.+?)"', re.DOTALL)
        return regob.findall(html)
    except:
        print 'failed downloading and saving', url
        return None


class Mycrawler:
    def __init__(self, crawlername, seeds, threadnum):
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = Crawqueue()
        self.initQueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        self.logfile = open('log2.txt', 'w')

    def initQueue(self, seeds):
        # Accept either a single seed URL or a list of them.
        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last, totalcount
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

    def crawling(self):
        # Single-threaded breadth-first crawl.
        global cur, depth, last, totalcount
        self.log(">>>depth " + str(depth) + ":\n")
        while self.crawqueue.getQueueCount() != 0:
            url = self.crawqueue.pop()
            self.log(url)
            if url is None:
                continue
            self.crawqueue.addToVisited(url)
            links = getlinks(url)
            if links is None:
                print 'none'
                self.crawqueue.failed.append(url)
                continue
            beforenum = self.crawqueue.getQueueCount()
            self.crawqueue.addLinks(links)
            afternum = self.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            cur += 1
            if cur == last:
                # Every URL of the current depth has been processed.
                depth += 1
                self.log(">>>depth " + str(depth) + ":\n")
                last = totalcount

    def crawling2(self):
        # Multi-threaded crawl: launch up to threadnum worker threads per
        # pass, then wait (at most 30s each) for them to finish. Note that
        # finished workers stay in self.threadpools and are joined again
        # on every pass.
        global last, totalcount, depth
        self.log(">>>depth " + str(depth) + ":\n")
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount
        while self.crawqueue.getQueueCount() != 0:
            for i in range(self.threadnum):
                url = self.crawqueue.pop()
                if url is None:
                    break
                crawthread = Crawlerthread(url, i, self)
                self.threadpools.append(crawthread)
                crawthread.start()
            for i in range(len(self.threadpools)):
                crawthread = self.threadpools[i]
                crawthread.join(30)

    def log(self, content):
        self.logfile.write(content + "\n")


class Crawlerthread(threading.Thread):
    # Worker thread: fetch one URL, then update the shared queue and the
    # depth counters under t_mutex.
    def __init__(self, url, tid, mycrawler):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        global cur, last, totalcount, depth
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        links = getlinks(self.url)  # network I/O happens outside the lock
        if links is None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            self.mycrawler.crawqueue.addToFailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            beforenum = self.mycrawler.crawqueue.getQueueCount()
            self.mycrawler.crawqueue.addLinks(links)
            afternum = self.mycrawler.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur += 1
        if cur == last:
            # Every URL of the current depth has been processed.
            depth += 1
            self.mycrawler.log(">>>depth " + str(depth) + ":\n")
            last = totalcount
        t_mutex.release()


class Crawqueue:
    # A simple FIFO of URLs to visit, plus visited/failed lists.
    def __init__(self):
        self.queue = []
        self.visited = []
        self.failed = []

    def getQueue(self):
        return self.queue

    def getVisited(self):
        return self.visited

    def getFailed(self):
        return self.failed

    def push(self, url):
        # Skip empty URLs and anything already queued or visited.
        if url != "" and url not in self.queue and url not in self.visited:
            self.queue.insert(0, url)

    def pop(self):
        if len(self.queue) == 0:
            # print 'failed to pop: queue is empty'
            return None
        return self.queue.pop()

    def isEmpty(self):
        return 1 if len(self.queue) == 0 else 0

    def addToVisited(self, url):
        self.visited.append(url)

    def addToFailed(self, url):
        self.failed.append(url)

    def remove(self, url):
        self.queue.remove(url)

    def getVisitedCount(self):
        return len(self.visited)

    def getQueueCount(self):
        return len(self.queue)

    def addLinks(self, links):
        for link in links:
            self.push(link)


if __name__ == "__main__":
    seeds = "http://www.douban.com/"
    threadnum = int(raw_input("set number of threads: "))
    crawlername = "little crawler"
    mycrawler = Mycrawler(crawlername, seeds, threadnum)
    mycrawler.crawling2()
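One detail worth spelling out is how the crawler tracks depth without storing a depth for each URL: cur counts pages processed, totalcount counts pages enqueued, and last remembers what totalcount was when the current level started. When cur catches up to last, every page of the current level has been handled, so depth is incremented and last is advanced. Below is a minimal standalone sketch of that bookkeeping; the in-memory toy_graph stands in for real HTTP fetches and is invented for illustration.

# Standalone illustration of the cur/last/totalcount depth trick.
# toy_graph is a made-up link structure instead of real pages.
toy_graph = {
    'a': ['b', 'c'],
    'b': ['d'],
    'c': ['d', 'e'],
    'd': [],
    'e': [],
}

queue = ['a']
visited = set()
cur = 0                   # pages processed so far
totalcount = len(queue)   # pages enqueued so far
last = totalcount         # when cur reaches this, the level is done
depth = 0

while queue:
    url = queue.pop(0)
    visited.add(url)
    for link in toy_graph[url]:
        if link not in queue and link not in visited:
            queue.append(link)
            totalcount += 1
    cur += 1
    if cur == last:       # everything at this depth has been processed
        depth += 1
        last = totalcount

print(depth)  # 3: incremented once as each of the three levels completes

This is why the worker threads must update cur, last, totalcount and depth under t_mutex: the counters only make sense if enqueue and dequeue counts stay consistent with each other.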
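The listing above is Python 2 only. As a point of comparison, and not part of the original post, here is a rough Python 3 sketch of the same design using urllib.request and concurrent.futures. The names (fetch_links, Crawler, _handle) are mine, and a real crawler would still want robots.txt handling, URL normalization, and politeness delays.

# Hypothetical Python 3 rewrite of the same idea (illustrative only).
import re
import threading
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

LINK_RE = re.compile(r'"(http://.+?)"', re.DOTALL)

def fetch_links(url):
    """Download a page and return the http:// links found in it, or None."""
    try:
        html = urlopen(url, timeout=10).read().decode('utf-8', 'replace')
        return LINK_RE.findall(html)
    except Exception:
        return None

class Crawler:
    def __init__(self, seeds, threadnum):
        self.lock = threading.Lock()   # replaces the global Condition
        self.queue = list(seeds)
        self.visited = set()
        self.failed = []
        self.threadnum = threadnum

    def _handle(self, url):
        with self.lock:
            if url in self.visited:    # mark early so duplicates are skipped
                return
            self.visited.add(url)
        links = fetch_links(url)       # network I/O outside the lock
        with self.lock:
            if links is None:
                self.failed.append(url)
                return
            for link in links:
                if link not in self.visited and link not in self.queue:
                    self.queue.append(link)

    def crawl(self, max_pages=100):
        with ThreadPoolExecutor(max_workers=self.threadnum) as pool:
            done = 0
            while self.queue and done < max_pages:
                # Take up to threadnum URLs and process them as one batch.
                batch = self.queue[:self.threadnum]
                self.queue = self.queue[self.threadnum:]
                list(pool.map(self._handle, batch))  # block until batch done
                done += len(batch)

if __name__ == "__main__":
    crawler = Crawler(["http://www.douban.com/"], threadnum=4)
    crawler.crawl(max_pages=20)
    print(len(crawler.visited), "pages visited,", len(crawler.failed), "failed")

The ThreadPoolExecutor replaces the hand-rolled pool: workers are reused across batches instead of one Thread object being created per URL, and a single Lock guards all shared state, which is simpler than the global Condition plus global counters in the original.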