I recently used Python for some web page analysis tasks. I haven't updated my blog in a long time, so I'm making up for it today. The following code uses:
1. Python Multithreading
2. The web page parsing library BeautifulSoup, which is much more powerful than the Python SGMLParser library I shared previously. If you are interested, check it out.
# encoding=utf-8
# Description: crawler that fetches web pages in one thread and parses
# them (with BeautifulSoup) in another, handing work off via queues.

import queue
import threading
import time
import urllib.request

# Pages to be crawled.
hosts = ["http://www.baidu.com", "http://www.163.com"]

in_queue = queue.Queue()   # hosts waiting to be downloaded
out_queue = queue.Queue()  # downloaded page bodies waiting to be parsed
class ThreadUrl(threading.Thread):
    """Fetch worker: takes a host URL from the input queue, downloads the
    page body, and puts the raw bytes on the output queue."""

    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        self.queue = queue          # queue of URLs to fetch
        self.out_queue = out_queue  # queue receiving fetched page bodies

    def run(self):
        while True:
            # Blocks until a host is available.
            host = self.queue.get()
            try:
                # NOTE(review): the original installed a ProxyHandler with a
                # placeholder address; if a proxy is needed, configure it via
                # urllib.request.ProxyHandler({'http': 'http://x.x.x.x:port'}).
                # The context manager guarantees the response is closed.
                with urllib.request.urlopen(host, timeout=10) as resp:
                    chunk = resp.read()
                self.out_queue.put(chunk)
            finally:
                # Always mark the task done, even on a failed fetch, so that
                # queue.join() in main() cannot deadlock.
                self.queue.task_done()
class DatamineThread(threading.Thread):
    """Parse worker: takes raw page bytes from the output queue and prints
    the page's <title> elements."""

    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        self.out_queue = out_queue  # queue of fetched page bodies to parse

    def run(self):
        # Imported lazily so the module still loads without bs4 installed.
        from bs4 import BeautifulSoup  # third-party: pip install beautifulsoup4
        while True:
            chunk = self.out_queue.get()
            try:
                soup = BeautifulSoup(chunk, "html.parser")
                print(soup.find_all("title"))
            finally:
                # Always mark the task done so out_queue.join() cannot deadlock.
                self.out_queue.task_done()
def main():
    """Spawn the fetch and parse worker threads, feed the host list into
    the input queue, and block until every page has been processed."""
    # Daemon threads: run() loops forever, so they must not keep the
    # interpreter alive after the queues drain.
    t = ThreadUrl(in_queue, out_queue)
    t.daemon = True
    t.start()

    # Populate the input queue with the pages to fetch.
    for host in hosts:
        in_queue.put(host)

    dt = DatamineThread(out_queue)
    dt.daemon = True
    dt.start()

    # Wait until both pipeline stages have processed everything.
    in_queue.join()
    out_queue.join()


if __name__ == "__main__":
    start = time.time()
    main()
    print("Elapsed time: %s" % (time.time() - start))
To run the preceding program you need to install BeautifulSoup. Here is the BeautifulSoup documentation.
That's today's share: multi-threaded web page capture and parsing with Python and BeautifulSoup. If you run into any issues running it, please leave a comment and we can discuss.
Article from: http://www.ibm.com/developerworks/cn/aix/library/au-threadingpython/