Two examples of a multi-threaded web crawler in Python

Source: Internet
Author: User
Tags: join

In general, there are two ways to use threads. One is to write the function the thread should execute and pass that function to a Thread object. The other is to subclass threading.Thread, create a new class, and put the thread's code in the new class's run() method.
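For example (a minimal sketch of my own, not part of the examples below), the two styles look like this:

import threading

# Style 1: pass the function to execute to a Thread object
def work(label):
    print("worker %s running" % label)

t1 = threading.Thread(target=work, args=("A",))

# Style 2: subclass threading.Thread and put the code in run()
class Worker(threading.Thread):
    def __init__(self, label):
        threading.Thread.__init__(self)
        self.label = label
    def run(self):
        print("worker %s running" % self.label)

t2 = Worker("B")
t1.start()
t2.start()
t1.join()
t2.join()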

The crawler below uses multiple threads plus a lock to implement a breadth-first crawl of the web.

First, a brief outline of the implementation idea:

For a crawler that downloads pages in breadth-first order, the process looks like this:

1. Download the first page from the given entry URL.

2. Extract all new page addresses from that page and put them in the download list.

3. Download every page whose address is in the download list.

4. From the newly downloaded pages, extract the addresses that have not been downloaded yet and update the download list with them.

5. Repeat steps 3 and 4 until the updated download list is empty, then stop. (A minimal single-threaded sketch of this loop follows.)
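Before the threaded versions, here is a minimal single-threaded sketch of that loop (my own illustration, written in the same Python 2 style as the examples; the link regex is the one the examples use):

# Single-threaded breadth-first crawl, illustrative only.
import re
import urllib

link_re = re.compile(r'"(http://.+?)"', re.DOTALL)

def bfs_crawl(entry_url, max_pages=50):  # max_pages is an extra safety cap, not part of the original steps
    to_download = [entry_url]     # steps 1-2: the download list
    downloaded = set()            # addresses already fetched
    while to_download and len(downloaded) < max_pages:
        new_links = []
        for url in to_download:   # step 3: download everything in the list
            try:
                html = urllib.urlopen(url).read()
            except Exception:
                continue
            downloaded.add(url)
            new_links += link_re.findall(html)
        # step 4: keep only addresses that have not been downloaded yet
        to_download = [u for u in set(new_links) if u not in downloaded]
    return downloaded

Examples 1 and 2 below parallelise step 3 by handing each round of URLs to a pool of worker threads.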

Example 1

The code is as follows:

#!/usr/bin/env python
# coding: utf-8
# Python 2 code: urllib.urlopen, raw_input and the print statement are used.
import threading
import urllib
import re
import time

g_mutex = threading.Condition()   # lock guarding the shared lists below
g_pages = []      # downloaded page contents; all new URLs are parsed from these
g_queueurl = []   # URLs waiting to be crawled
g_existurl = []   # URLs that have already been crawled
g_failedurl = []  # URLs that failed to download
g_totalcount = 0  # number of pages downloaded

class Crawler:
    def __init__(self, crawlername, url, threadnum):
        self.crawlername = crawlername
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []
        self.logfile = file("log.txt", 'w')

    def craw(self):
        global g_queueurl
        g_queueurl.append(self.url)
        depth = 0
        print self.crawlername + " start..."
        while len(g_queueurl) != 0:
            depth += 1
            print 'Searching depth ', depth, '...\n'
            self.logfile.write("URL: " + g_queueurl[0] + "\n")
            self.downloadall()
            self.updatequeueurl()
            content = '\n>>>Depth ' + str(depth) + ':\n'
            self.logfile.write(content)
            i = 0
            while i < len(g_queueurl):
                content = str(g_totalcount + i) + '->' + g_queueurl[i] + '\n'
                self.logfile.write(content)
                i += 1

    def downloadall(self):
        global g_queueurl
        global g_totalcount
        i = 0
        while i < len(g_queueurl):
            j = 0
            while j < self.threadnum and i + j < len(g_queueurl):
                g_totalcount += 1
                threadresult = self.download(g_queueurl[i + j], str(g_totalcount) + '.html', j)
                if threadresult != None:
                    print 'Thread started: ', i + j, '--file number =', g_totalcount
                j += 1
            i += j
            for thread in self.threadpool:
                thread.join(30)
            self.threadpool = []
        g_queueurl = []

    def download(self, url, filename, tid):
        crawthread = CrawlerThread(url, filename, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
        return crawthread

    def updatequeueurl(self):
        global g_queueurl
        global g_existurl
        newurllist = []
        for content in g_pages:
            newurllist += self.getUrl(content)
        # keep only URLs that have not been crawled yet
        g_queueurl = list(set(newurllist) - set(g_existurl))

    def getUrl(self, content):
        reg = r'"(http://.+?)"'
        regob = re.compile(reg, re.DOTALL)
        urllist = regob.findall(content)
        return urllist

class CrawlerThread(threading.Thread):
    def __init__(self, url, filename, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.tid = tid

    def run(self):
        global g_mutex
        global g_failedurl
        global g_queueurl
        try:
            page = urllib.urlopen(self.url)
            html = page.read()
            fout = file(self.filename, 'w')
            fout.write(html)
            fout.close()
        except Exception, e:
            g_mutex.acquire()
            g_existurl.append(self.url)
            g_failedurl.append(self.url)
            g_mutex.release()
            print 'Failed downloading and saving', self.url
            print e
            return None
        g_mutex.acquire()
        g_pages.append(html)
        g_existurl.append(self.url)
        g_mutex.release()

if __name__ == "__main__":
    url = raw_input("Please enter the entry URL:\n")
    threadnum = int(raw_input("Set number of threads: "))
    crawlername = "little crawler"
    crawler = Crawler(crawlername, url, threadnum)
    crawler.craw()
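A note on the locking in Example 1 (my reading of the code, not text from the original article): g_mutex is a threading.Condition used purely as a lock, serialising every update to the shared lists g_pages, g_existurl and g_failedurl; no wait()/notify() is ever called. A plain threading.Lock used as a context manager would do the same job, roughly like this (record_page is a hypothetical helper for illustration):

import threading

g_mutex = threading.Lock()   # a plain lock is enough; the Condition's wait/notify features are unused
g_pages = []
g_existurl = []

def record_page(url, html):
    # hypothetical helper: equivalent to the acquire()/release() pairs in CrawlerThread.run()
    with g_mutex:
        g_pages.append(html)
        g_existurl.append(url)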


Example 2

The code is as follows:

#!/usr/bin/env python
# coding: utf-8
# Python 2 code, same style as Example 1.
import threading
import urllib
import re
import time

cur = 0          # number of URLs processed so far
last = 0         # count at which the current depth level ends
totalcount = 0   # total number of URLs seen so far
depth = 0        # current crawl depth
t_mutex = threading.Condition()   # lock guarding the shared state above

def getlinks(url):
    # Download a page and return the list of http:// links found in it.
    try:
        page = urllib.urlopen(url)
        html = page.read()
        reg = r'"(http://.+?)"'
        regob = re.compile(reg, re.DOTALL)
        links = regob.findall(html)
        return links
    except:
        print 'Failed downloading and saving', url
        return None

class MyCrawler:
    def __init__(self, crawlername, seeds, threadnum):
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = CrawQueue()
        self.initQueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        self.logfile = file('log2.txt', 'w')

    def initQueue(self, seeds):
        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last
        global totalcount
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

    def crawling(self):
        # single-threaded version of the crawl loop
        global cur
        global depth
        global last
        global totalcount
        self.log(">>>Depth " + str(depth) + ":\n")
        while self.crawqueue.getQueueCount() != 0:
            url = self.crawqueue.pop()
            self.log(url)
            if url == None:
                continue
            self.crawqueue.addToVisited(url)
            links = getlinks(url)
            if links == None:
                print 'None'
                self.crawqueue.addToFailed(url)
                continue
            beforenum = self.crawqueue.getQueueCount()
            self.crawqueue.addLinks(links)
            afternum = self.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            cur += 1
            if cur == last:
                depth += 1
                self.log(">>>Depth " + str(depth) + ":\n")
                last = totalcount

    def crawling2(self):
        # multi-threaded version: hand out up to threadnum URLs per round
        global last
        global totalcount
        global depth
        self.log(">>>Depth " + str(depth) + ":\n")
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount
        while self.crawqueue.getQueueCount() != 0:
            for i in range(self.threadnum):
                url = self.crawqueue.pop()
                if url == None:
                    break
                crawthread = CrawlerThread(url, i, self)
                self.threadpools.append(crawthread)
                crawthread.start()
            for i in range(len(self.threadpools)):
                crawthread = self.threadpools[i]
                crawthread.join(30)

    def log(self, content):
        self.logfile.write(content + "\n")

class CrawlerThread(threading.Thread):
    def __init__(self, url, tid, mycrawler):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        global t_mutex
        global cur
        global last
        global totalcount
        global depth
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        links = getlinks(self.url)
        if links == None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            self.mycrawler.crawqueue.addToFailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            beforenum = self.mycrawler.crawqueue.getQueueCount()
            self.mycrawler.crawqueue.addLinks(links)
            afternum = self.mycrawler.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur += 1
        if cur == last:
            depth += 1
            self.mycrawler.log(">>>Depth " + str(depth) + ":\n")
            last = totalcount
        t_mutex.release()

class CrawQueue:
    def __init__(self):
        self.queue = []     # URLs waiting to be crawled
        self.visited = []   # URLs already crawled
        self.failed = []    # URLs that failed to download

    def getQueue(self):
        return self.queue

    def getVisited(self):
        return self.visited

    def getFailed(self):
        return self.failed

    def push(self, url):
        self.queue.insert(0, url)

    def pop(self):
        if len(self.queue) == 0:
            #print 'failed to pop: queue is empty'
            return None
        else:
            return self.queue.pop()

    def isEmpty(self):
        if len(self.queue) == 0:
            return 1
        else:
            return 0

    def addToVisited(self, url):
        self.visited.append(url)

    def addToFailed(self, url):
        self.failed.append(url)

    def remove(self, url):
        self.queue.remove(url)

    def getVisitedCount(self):
        return len(self.visited)

    def getQueueCount(self):
        return len(self.queue)

    def addLinks(self, links):
        for link in links:
            self.push(link)

if __name__ == "__main__":
    seeds = "http://www.111cn.net/"
    threadnum = int(raw_input("Set number of threads: "))
    crawlername = "little crawler"
    mycrawler = MyCrawler(crawlername, seeds, threadnum)
    mycrawler.crawling2()
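Both examples target Python 2 (urllib.urlopen, raw_input, file(), the print statement). If you want to try them on Python 3, the main substitutions are roughly as follows (an illustrative fragment, not a full port):

# Python 3 equivalents of the Python 2 calls used above.
import urllib.request

url = "http://www.111cn.net/"                      # same seed URL as Example 2
html = urllib.request.urlopen(url).read()          # was: urllib.urlopen(url).read()
text = html.decode('utf-8', 'ignore')              # bytes must be decoded before matching str regexes
threadnum = int(input("Set number of threads: "))  # was: raw_input(...)
logfile = open("log2.txt", "w")                    # was: file('log2.txt', 'w')
print("downloaded", len(text), "characters")       # print is a function in Python 3
logfile.close()
# and "except Exception, e:" becomes "except Exception as e:"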


That is the code for two multi-threaded web crawler implementations in Python; hopefully it is useful to you.
