In general, there are two ways to use threads: either create a function for the thread to execute and pass that function to a thread object, or inherit from Thread, create a new class, and put the thread's work in that class's run() method.
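As a quick, minimal sketch of the two modes (not part of the crawler code below; the worker function and class names are only placeholders), using the standard threading module:

import threading

# Mode 1: pass the worker function to a Thread object as its target.
def work(label):
    print("worker %s running" % label)

t1 = threading.Thread(target=work, args=("A",))

# Mode 2: subclass Thread and put the work in run().
class Worker(threading.Thread):
    def __init__(self, label):
        threading.Thread.__init__(self)
        self.label = label

    def run(self):
        print("worker %s running" % self.label)

t2 = Worker("B")

for t in (t1, t2):
    t.start()
for t in (t1, t2):
    t.join()

Both examples below use the second mode: a CrawlerThread class that inherits from threading.Thread and overrides run().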
This article implements a multi-threaded web crawler that uses threads plus a lock to crawl pages with a breadth-first algorithm.
First, a brief outline of the implementation idea.
For a crawler that downloads pages by breadth-first traversal, the process looks like this (a single-threaded sketch of the loop follows the list):
1. Download the first page from the given entry URL
2. Extract all new page addresses from that first page and put them into the download list
3. Download every page whose address is in the download list
4. From the newly downloaded pages, collect the addresses of pages that have not yet been downloaded and update the download list
5. Repeat steps 3 and 4 until the updated download list is empty, then stop
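Here is the promised single-threaded sketch of that loop, just to make the five steps concrete. The getlinks(url) helper is assumed to download a page and return the URLs it contains; this is only an outline, not the code of the two examples below:

def bfs_crawl(seed, getlinks):
    # getlinks(url) is assumed to download the page at url and return the links found in it
    to_download = [seed]        # step 1: the download list starts with the entry URL
    visited = set()
    while to_download:          # step 5: stop once the updated download list is empty
        new_pages = []
        for url in to_download:              # step 3: download everything in the list
            visited.add(url)
            new_pages.append(getlinks(url))  # step 2: extract the addresses from each page
        found = set()
        for links in new_pages:
            found.update(links)
        to_download = list(found - visited)  # step 4: keep only addresses not yet downloaded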
Example 1
The code is as follows:
#!/usr/bin/env python
# coding=utf-8
import threading
import urllib
import re
import time

g_mutex = threading.Condition()
g_pages = []      # page contents from which all URL links are parsed
g_queueurl = []   # list of URLs waiting to be crawled
g_existurl = []   # list of URLs that have already been crawled
g_failedurl = []  # list of URLs that failed to download
g_totalcount = 0  # number of pages downloaded

class Crawler:
    def __init__(self, crawlername, url, threadnum):
        self.crawlername = crawlername
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []
        self.logfile = open("log.txt", 'w')

    def craw(self):
        global g_queueurl
        g_queueurl.append(self.url)   # seed the queue with the entry URL
        depth = 0
        print self.crawlername + " start..."
        while len(g_queueurl) != 0:
            depth += 1
            print 'Searching depth', depth, '...\n'
            self.logfile.write("URL:" + g_queueurl[0] + "...")
            self.downloadall()
            self.updatequeueurl()
            content = '\n>>>Depth ' + str(depth) + ':\n'
            self.logfile.write(content)
            i = 0
            while i < len(g_queueurl):
                content = str(g_totalcount + i) + '->' + g_queueurl[i] + '\n'
                self.logfile.write(content)
                i += 1

    def downloadall(self):
        # Download the current queue in batches of threadnum threads.
        global g_queueurl
        global g_totalcount
        i = 0
        while i < len(g_queueurl):
            j = 0
            while j < self.threadnum and i + j < len(g_queueurl):
                g_totalcount += 1
                threadresult = self.download(g_queueurl[i + j], str(g_totalcount) + '.html', j)
                if threadresult != None:
                    print 'Thread started:', i + j, '--file number =', g_totalcount
                j += 1
            i += j
            for thread in self.threadpool:
                thread.join(30)
            self.threadpool = []
        g_queueurl = []

    def download(self, url, filename, tid):
        crawthread = CrawlerThread(url, filename, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
        return crawthread

    def updatequeueurl(self):
        # Rebuild the queue from the links found in the downloaded pages,
        # minus the URLs that have already been crawled.
        global g_queueurl
        global g_existurl
        newurllist = []
        for content in g_pages:
            newurllist += self.geturl(content)
        g_queueurl = list(set(newurllist) - set(g_existurl))

    def geturl(self, content):
        reg = r'"(http://.+?)"'
        regob = re.compile(reg, re.DOTALL)
        urllist = regob.findall(content)
        return urllist

class CrawlerThread(threading.Thread):
    def __init__(self, url, filename, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.tid = tid

    def run(self):
        global g_mutex
        global g_failedurl
        global g_queueurl
        try:
            page = urllib.urlopen(self.url)
            html = page.read()
            fout = open(self.filename, 'w')
            fout.write(html)
            fout.close()
        except Exception as e:
            # Record the failure under the lock so the shared lists stay consistent.
            g_mutex.acquire()
            g_existurl.append(self.url)
            g_failedurl.append(self.url)
            g_mutex.release()
            print 'Failed downloading and saving', self.url
            print e
            return None
        g_mutex.acquire()
        g_pages.append(html)
        g_existurl.append(self.url)
        g_mutex.release()

if __name__ == "__main__":
    url = raw_input("Please enter the entry URL:\n")
    threadnum = int(raw_input("Set number of threads:"))
    crawlername = "little crawler"
    crawler = Crawler(crawlername, url, threadnum)
    crawler.craw()
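Note that Example 1 is Python 2 code (print statements, urllib.urlopen, raw_input, and so on). If you want to try the same download-and-lock pattern on Python 3, the body of CrawlerThread.run() could be adapted roughly as in the sketch below; this is only an assumed adaptation, not tested replacement code, and the global names mirror the ones above:

import threading
import urllib.request   # Python 3 equivalent of Python 2's urllib.urlopen

g_mutex = threading.Lock()
g_pages, g_existurl, g_failedurl = [], [], []

def fetch_and_record(url, filename):
    # Download one page, save it to disk, and update the shared lists under the lock.
    try:
        html = urllib.request.urlopen(url, timeout=30).read()
        with open(filename, "wb") as fout:
            fout.write(html)
    except Exception as e:
        with g_mutex:                 # the with-statement acquires and releases the lock
            g_existurl.append(url)
            g_failedurl.append(url)
        print("Failed downloading and saving", url, e)
        return
    with g_mutex:
        g_pages.append(html.decode("utf-8", "ignore"))
        g_existurl.append(url)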
Example 2
The code is as follows:
#!/usr/bin/env python
# coding=utf-8
import threading
import urllib
import re
import time

cur = 0            # number of URLs processed so far
last = 0           # URL count at which the current depth level ends
totalcount = 0     # total number of URLs discovered so far
depth = 0
t_mutex = threading.Condition()

class MyCrawler:
    def __init__(self, crawlername, seeds, threadnum):
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = CrawQueue()
        self.initqueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        self.logfile = open('log2.txt', 'w')

    def initqueue(self, seeds):
        # Accept either a single seed URL or a list of seeds.
        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last
        global totalcount
        totalcount = self.crawqueue.getqueuecount()
        last = totalcount

    def crawling(self):
        # Single-threaded variant (not used in __main__); note that it calls
        # self.getlinks(), which is only defined on CrawlerThread.
        global cur
        global depth
        global last
        global totalcount
        self.log(">>>Depth " + str(depth) + ":\n")
        while self.crawqueue.getqueuecount() != 0:
            url = self.crawqueue.pop()
            self.log(url)
            if url == None:
                continue
            self.crawqueue.addtovisited(url)
            links = self.getlinks(url)
            if links == None:
                print 'None'
                self.crawqueue.failed.append(url)
                continue
            beforenum = self.crawqueue.getqueuecount()
            self.crawqueue.addlinks(links)
            afternum = self.crawqueue.getqueuecount()
            totalcount += afternum - beforenum
            cur += 1
            if cur == last:
                depth += 1
                self.log(">>>Depth " + str(depth) + ":\n")
                last = totalcount

    def crawling2(self):
        # Multi-threaded variant: start up to threadnum CrawlerThread workers per batch.
        global last
        global totalcount
        global depth
        self.log(">>>Depth " + str(depth) + ":\n")
        totalcount = self.crawqueue.getqueuecount()
        last = totalcount
        while self.crawqueue.getqueuecount() != 0:
            for i in range(self.threadnum):
                url = self.crawqueue.pop()
                if url == None:
                    break
                crawthread = CrawlerThread(url, i, self)
                self.threadpools.append(crawthread)
                crawthread.start()
            for i in range(len(self.threadpools)):
                crawthread = self.threadpools[i]
                crawthread.join(30)

    def log(self, content):
        self.logfile.write(content + "\n")

class CrawlerThread(threading.Thread):
    def __init__(self, url, tid, mycrawler):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        global t_mutex
        global cur
        global last
        global totalcount
        global depth
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        links = self.getlinks(self.url)
        if links == None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addtovisited(self.url)
            self.mycrawler.crawqueue.addtofailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addtovisited(self.url)
            beforenum = self.mycrawler.crawqueue.getqueuecount()
            self.mycrawler.crawqueue.addlinks(links)
            afternum = self.mycrawler.crawqueue.getqueuecount()
            totalcount += afternum - beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur += 1
        if cur == last:
            depth += 1
            self.mycrawler.log(">>>Depth " + str(depth) + ":\n")
            last = totalcount
        t_mutex.release()

    def getlinks(self, url):
        try:
            page = urllib.urlopen(url)
            html = page.read()
            reg = r'"(http://.+?)"'
            regob = re.compile(reg, re.DOTALL)
            links = regob.findall(html)
            return links
        except Exception:
            print 'Failed downloading and saving', url
            return None

class CrawQueue:
    def __init__(self):
        self.queue = []     # URLs waiting to be crawled (FIFO)
        self.visited = []   # URLs already crawled
        self.failed = []    # URLs that failed to download

    def getqueue(self):
        return self.queue

    def getvisited(self):
        return self.visited

    def getfailed(self):
        return self.failed

    def push(self, url):
        self.queue.insert(0, url)

    def pop(self):
        if len(self.queue) == 0:
            # print 'failed to pop: queue is empty'
            return None
        else:
            return self.queue.pop()

    def isempty(self):
        if len(self.queue) == 0:
            return 1
        else:
            return 0

    def addtovisited(self, url):
        self.visited.append(url)

    def addtofailed(self, url):
        self.failed.append(url)

    def remove(self, url):
        self.queue.remove(url)

    def getvisitedcount(self):
        return len(self.visited)

    def getqueuecount(self):
        return len(self.queue)

    def addlinks(self, links):
        for link in links:
            self.push(link)

if __name__ == "__main__":
    seeds = "http://www.111cn.net/"
    threadnum = int(raw_input("Set number of threads:"))
    crawlername = "little crawler"
    mycrawler = MyCrawler(crawlername, seeds, threadnum)
    mycrawler.crawling2()
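A short design note on Example 2: CrawQueue.push() inserts at the front of the list and pop() removes from the end, so the hand-written class behaves as a FIFO queue, and the worker threads guard their queue updates with t_mutex. The standard library's Queue module (queue in Python 3) already provides a thread-safe FIFO queue, so a possible, untested simplification would look like this:

try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2

url_queue = queue.Queue()   # thread-safe FIFO; put()/get() need no explicit lock
url_queue.put("http://www.111cn.net/")

def pop_url():
    # Mirrors CrawQueue.pop(): return None instead of blocking when the queue is empty.
    try:
        return url_queue.get_nowait()
    except queue.Empty:
        return None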
That is the code for implementing a multi-threaded web crawler in Python; hopefully it is useful to you.