Two examples of a multi-threaded web crawler in Python

Source: Internet
Author: User
Tags: join

In general, there are two ways to use threads. One is to write the function the thread should execute and pass that function to a Thread object. The other is to subclass threading.Thread, create a new class, and put the thread's code in the new class's run() method.
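For example (a minimal sketch of my own, not part of the examples below), the two styles look like this:

import threading

# Style 1: pass the function to execute to a Thread object
def work(label):
    print("worker %s running" % label)

t1 = threading.Thread(target=work, args=("A",))

# Style 2: subclass threading.Thread and put the code in run()
class Worker(threading.Thread):
    def __init__(self, label):
        threading.Thread.__init__(self)
        self.label = label
    def run(self):
        print("worker %s running" % self.label)

t2 = Worker("B")
t1.start()
t2.start()
t1.join()
t2.join()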

The crawler below uses multiple threads plus a lock to implement a breadth-first crawl of the web.

First, a brief outline of the implementation idea:

For a crawler that downloads pages in breadth-first order, the process looks like this:

1. Download the first page from the given entry URL.

2. Extract all new page addresses from that page and put them in the download list.

3. Download every page whose address is in the download list.

4. From the newly downloaded pages, extract the addresses that have not been downloaded yet and update the download list with them.

5. Repeat steps 3 and 4 until the updated download list is empty, then stop. (A minimal single-threaded sketch of this loop follows.)
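Before the threaded versions, here is a minimal single-threaded sketch of that loop (my own illustration, written in the same Python 2 style as the examples; the link regex is the one the examples use):

# Single-threaded breadth-first crawl, illustrative only.
import re
import urllib

link_re = re.compile(r'"(http://.+?)"', re.DOTALL)

def bfs_crawl(entry_url, max_pages=50):  # max_pages is an extra safety cap, not part of the original steps
    to_download = [entry_url]     # steps 1-2: the download list
    downloaded = set()            # addresses already fetched
    while to_download and len(downloaded) < max_pages:
        new_links = []
        for url in to_download:   # step 3: download everything in the list
            try:
                html = urllib.urlopen(url).read()
            except Exception:
                continue
            downloaded.add(url)
            new_links += link_re.findall(html)
        # step 4: keep only addresses that have not been downloaded yet
        to_download = [u for u in set(new_links) if u not in downloaded]
    return downloaded

Examples 1 and 2 below parallelise step 3 by handing each round of URLs to a pool of worker threads.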

Example 1

The code is as follows:

#!/usr/bin/env python
# coding: utf-8
# Python 2 code: urllib.urlopen, raw_input and the print statement are used.
import threading
import urllib
import re
import time

g_mutex = threading.Condition()   # lock guarding the shared lists below
g_pages = []      # downloaded page contents; all new URLs are parsed from these
g_queueurl = []   # URLs waiting to be crawled
g_existurl = []   # URLs that have already been crawled
g_failedurl = []  # URLs that failed to download
g_totalcount = 0  # number of pages downloaded

class Crawler:
    def __init__(self, crawlername, url, threadnum):
        self.crawlername = crawlername
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []
        self.logfile = file("log.txt", 'w')

    def craw(self):
        global g_queueurl
        g_queueurl.append(self.url)
        depth = 0
        print self.crawlername + " start..."
        while len(g_queueurl) != 0:
            depth += 1
            print 'Searching depth ', depth, '...\n'
            self.logfile.write("URL: " + g_queueurl[0] + "\n")
            self.downloadall()
            self.updatequeueurl()
            content = '\n>>>Depth ' + str(depth) + ':\n'
            self.logfile.write(content)
            i = 0
            while i < len(g_queueurl):
                content = str(g_totalcount + i) + '->' + g_queueurl[i] + '\n'
                self.logfile.write(content)
                i += 1

    def downloadall(self):
        global g_queueurl
        global g_totalcount
        i = 0
        while i < len(g_queueurl):
            j = 0
            while j < self.threadnum and i + j < len(g_queueurl):
                g_totalcount += 1
                threadresult = self.download(g_queueurl[i + j], str(g_totalcount) + '.html', j)
                if threadresult != None:
                    print 'Thread started: ', i + j, '--file number =', g_totalcount
                j += 1
            i += j
            for thread in self.threadpool:
                thread.join(30)
            self.threadpool = []
        g_queueurl = []

    def download(self, url, filename, tid):
        crawthread = CrawlerThread(url, filename, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
        return crawthread

    def updatequeueurl(self):
        global g_queueurl
        global g_existurl
        newurllist = []
        for content in g_pages:
            newurllist += self.getUrl(content)
        # keep only URLs that have not been crawled yet
        g_queueurl = list(set(newurllist) - set(g_existurl))

    def getUrl(self, content):
        reg = r'"(http://.+?)"'
        regob = re.compile(reg, re.DOTALL)
        urllist = regob.findall(content)
        return urllist

class CrawlerThread(threading.Thread):
    def __init__(self, url, filename, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.tid = tid

    def run(self):
        global g_mutex
        global g_failedurl
        global g_queueurl
        try:
            page = urllib.urlopen(self.url)
            html = page.read()
            fout = file(self.filename, 'w')
            fout.write(html)
            fout.close()
        except Exception, e:
            g_mutex.acquire()
            g_existurl.append(self.url)
            g_failedurl.append(self.url)
            g_mutex.release()
            print 'Failed downloading and saving', self.url
            print e
            return None
        g_mutex.acquire()
        g_pages.append(html)
        g_existurl.append(self.url)
        g_mutex.release()

if __name__ == "__main__":
    url = raw_input("Please enter the entry URL:\n")
    threadnum = int(raw_input("Set number of threads: "))
    crawlername = "little crawler"
    crawler = Crawler(crawlername, url, threadnum)
    crawler.craw()
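A note on the locking in Example 1 (my reading of the code, not text from the original article): g_mutex is a threading.Condition used purely as a lock, serialising every update to the shared lists g_pages, g_existurl and g_failedurl; no wait()/notify() is ever called. A plain threading.Lock used as a context manager would do the same job, roughly like this (record_page is a hypothetical helper for illustration):

import threading

g_mutex = threading.Lock()   # a plain lock is enough; the Condition's wait/notify features are unused
g_pages = []
g_existurl = []

def record_page(url, html):
    # hypothetical helper: equivalent to the acquire()/release() pairs in CrawlerThread.run()
    with g_mutex:
        g_pages.append(html)
        g_existurl.append(url)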


Example 2

The code is as follows:

#!/usr/bin/env python
# coding: utf-8
# Python 2 code, same style as Example 1.
import threading
import urllib
import re
import time

cur = 0          # number of URLs processed so far
last = 0         # count at which the current depth level ends
totalcount = 0   # total number of URLs seen so far
depth = 0        # current crawl depth
t_mutex = threading.Condition()   # lock guarding the shared state above

def getlinks(url):
    # Download a page and return the list of http:// links found in it.
    try:
        page = urllib.urlopen(url)
        html = page.read()
        reg = r'"(http://.+?)"'
        regob = re.compile(reg, re.DOTALL)
        links = regob.findall(html)
        return links
    except:
        print 'Failed downloading and saving', url
        return None

class MyCrawler:
    def __init__(self, crawlername, seeds, threadnum):
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = CrawQueue()
        self.initQueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        self.logfile = file('log2.txt', 'w')

    def initQueue(self, seeds):
        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last
        global totalcount
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

    def crawling(self):
        # single-threaded version of the crawl loop
        global cur
        global depth
        global last
        global totalcount
        self.log(">>>Depth " + str(depth) + ":\n")
        while self.crawqueue.getQueueCount() != 0:
            url = self.crawqueue.pop()
            self.log(url)
            if url == None:
                continue
            self.crawqueue.addToVisited(url)
            links = getlinks(url)
            if links == None:
                print 'None'
                self.crawqueue.addToFailed(url)
                continue
            beforenum = self.crawqueue.getQueueCount()
            self.crawqueue.addLinks(links)
            afternum = self.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            cur += 1
            if cur == last:
                depth += 1
                self.log(">>>Depth " + str(depth) + ":\n")
                last = totalcount

    def crawling2(self):
        # multi-threaded version: hand out up to threadnum URLs per round
        global last
        global totalcount
        global depth
        self.log(">>>Depth " + str(depth) + ":\n")
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount
        while self.crawqueue.getQueueCount() != 0:
            for i in range(self.threadnum):
                url = self.crawqueue.pop()
                if url == None:
                    break
                crawthread = CrawlerThread(url, i, self)
                self.threadpools.append(crawthread)
                crawthread.start()
            for i in range(len(self.threadpools)):
                crawthread = self.threadpools[i]
                crawthread.join(30)

    def log(self, content):
        self.logfile.write(content + "\n")

class CrawlerThread(threading.Thread):
    def __init__(self, url, tid, mycrawler):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        global t_mutex
        global cur
        global last
        global totalcount
        global depth
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        links = getlinks(self.url)
        if links == None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            self.mycrawler.crawqueue.addToFailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            beforenum = self.mycrawler.crawqueue.getQueueCount()
            self.mycrawler.crawqueue.addLinks(links)
            afternum = self.mycrawler.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur += 1
        if cur == last:
            depth += 1
            self.mycrawler.log(">>>Depth " + str(depth) + ":\n")
            last = totalcount
        t_mutex.release()

class CrawQueue:
    def __init__(self):
        self.queue = []     # URLs waiting to be crawled
        self.visited = []   # URLs already crawled
        self.failed = []    # URLs that failed to download

    def getQueue(self):
        return self.queue

    def getVisited(self):
        return self.visited

    def getFailed(self):
        return self.failed

    def push(self, url):
        self.queue.insert(0, url)

    def pop(self):
        if len(self.queue) == 0:
            #print 'failed to pop: queue is empty'
            return None
        else:
            return self.queue.pop()

    def isEmpty(self):
        if len(self.queue) == 0:
            return 1
        else:
            return 0

    def addToVisited(self, url):
        self.visited.append(url)

    def addToFailed(self, url):
        self.failed.append(url)

    def remove(self, url):
        self.queue.remove(url)

    def getVisitedCount(self):
        return len(self.visited)

    def getQueueCount(self):
        return len(self.queue)

    def addLinks(self, links):
        for link in links:
            self.push(link)

if __name__ == "__main__":
    seeds = "http://www.111cn.net/"
    threadnum = int(raw_input("Set number of threads: "))
    crawlername = "little crawler"
    mycrawler = MyCrawler(crawlername, seeds, threadnum)
    mycrawler.crawling2()
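Both examples target Python 2 (urllib.urlopen, raw_input, file(), the print statement). If you want to try them on Python 3, the main substitutions are roughly as follows (an illustrative fragment, not a full port):

# Python 3 equivalents of the Python 2 calls used above.
import urllib.request

url = "http://www.111cn.net/"                      # same seed URL as Example 2
html = urllib.request.urlopen(url).read()          # was: urllib.urlopen(url).read()
text = html.decode('utf-8', 'ignore')              # bytes must be decoded before matching str regexes
threadnum = int(input("Set number of threads: "))  # was: raw_input(...)
logfile = open("log2.txt", "w")                    # was: file('log2.txt', 'w')
print("downloaded", len(text), "characters")       # print is a function in Python 3
logfile.close()
# and "except Exception, e:" becomes "except Exception as e:"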


That is the code for two multi-threaded web crawler implementations in Python; hopefully it is useful to you.
