Python multi-thread, asynchronous + multi-process crawler implementation code



Install Tornado
The grequests library could be used directly, but the example below uses Tornado's asynchronous HTTP client instead. Starting from the example in the official documentation, a simple asynchronous crawler class is built; see the latest Tornado documentation for details.

pip install tornado
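
To confirm the client works after installing, a minimal standalone fetch looks roughly like this (a sketch assuming Tornado 4.x-style coroutines, the same style used in the code below; fetch_one is just an illustrative name):

# Minimal sanity check for Tornado's asynchronous HTTP client
# (coroutine style matching the spider classes below).
from tornado import httpclient, gen, ioloop

@gen.coroutine
def fetch_one(url):
  response = yield httpclient.AsyncHTTPClient().fetch(url)
  print('fetched %s: status %s, %s bytes' % (url, response.code, len(response.body)))

if __name__ == '__main__':
  ioloop.IOLoop.current().run_sync(lambda: fetch_one('http://www.baidu.com'))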

Asynchronous Crawler

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency=10, **kwargs):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def fetch(self, url, **kwargs):
    fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
    return fetch(url, **kwargs)

  def handle_html(self, url, html):
    """handle html page"""
    print(url)

  def handle_response(self, url, response):
    """inherit and rewrite this method"""
    if response.code == 200:
      self.handle_html(url, response.body)
    elif response.code == 599:  # retry
      self._fetching.remove(url)
      self._q.put(url)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield self.fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return(e)
    raise gen.Return(response)

  @gen.coroutine
  def _run(self):
    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return
        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)
        response = yield self.get_page(current_url)
        self.handle_response(current_url, response)  # handle response
        self._fetched.add(current_url)
        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())
      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())  # add first url
    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()
    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


class MySpider(AsySpider):
  def fetch(self, url, **kwargs):
    """override the parent fetch method to add cookies, headers, timeout and other information"""
    cookies_str = ("PHPSESSID=j1tt66a829idnms56ppb70jri4; think_language=zh-cn; "
                   "SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; "
                   "CNZZDATA1254842228=1433864393-1442821331-%7C1444972138")  # copy the cookie string from the browser
    headers = {
      'user-agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
      'cookie': cookies_str
    }
    return super(MySpider, self).fetch(  # for the parameters, see the tornado documentation
      url, headers=headers, request_timeout=1)

  def handle_html(self, url, html):
    print(url, html)


def main():
  urls = []
  for page in range(1, 100):
    urls.append('http://www.baidu.com?page=%s' % page)
  s = MySpider(urls)
  s.run()


if __name__ == '__main__':
  main()

You can inherit this class, pass in your own URLs, and override handle_html (or handle_response) to process each fetched page, as in the sketch below.
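
A minimal subclass might look like this (a sketch reusing the AsySpider class above; PrintLenSpider and the parsing placeholder are illustrative, not part of the original code):

# Minimal subclass of AsySpider: feed it URLs and override handle_html.
class PrintLenSpider(AsySpider):
  def handle_html(self, url, html):
    # html is the raw response body; real parsing (lxml, BeautifulSoup, ...)
    # would go here -- printing the length is just a placeholder.
    print('%s -> %s bytes' % (url, len(html)))

if __name__ == '__main__':
  urls = ['http://www.baidu.com?page=%s' % p for p in range(1, 20)]
  PrintLenSpider(urls, concurrency=5).run()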

Asynchronous + multi-process Crawler
You can also add a process pool using the multiprocessing module, which is quite efficient:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()

  def handle_page(self, url, html):
    filename = url.rsplit('/', 1)[1]
    with open(filename, 'w+') as f:
      f.write(html)

  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield httpclient.AsyncHTTPClient().fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return('')
    raise gen.Return(response.body)

  @gen.coroutine
  def _run(self):
    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return
        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)
        html = yield self.get_page(current_url)
        self._fetched.add(current_url)
        self.handle_page(current_url, html)
        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())
      finally:
        self._q.task_done()

    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()

    self._q.put(self.urls.pop())
    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()
    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched

  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)


def run_spider(beg, end):
  urls = []
  for page in range(beg, end):
    urls.append('http://127.0.0.1/%s.htm' % page)
  s = AsySpider(urls, 10)
  s.run()


def main():
  _st = time.time()
  p = Pool()
  all_num = 73000
  num = 4  # number of cpu cores
  per_num, left = divmod(all_num, num)
  s = range(0, all_num, per_num)
  res = []
  for i in range(len(s)-1):
    res.append((s[i], s[i+1]))
  res.append((s[len(s)-1], all_num))
  print res
  for i in res:
    p.apply_async(run_spider, args=(i[0], i[1],))
  p.close()
  p.join()
  print time.time()-_st


if __name__ == '__main__':
  main()
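
The range splitting in main() is plain divmod arithmetic: 73000 pages over 4 processes gives chunks of 18250, i.e. (0, 18250), (18250, 36500), (36500, 54750), (54750, 73000). A standalone sketch of that partitioning (split_ranges is a hypothetical helper, not part of the original code):

# Sketch of the (beg, end) partitioning done inline in main() above.
def split_ranges(total, parts):
  per, _left = divmod(total, parts)  # nominal chunk size
  starts = range(0, total, per)
  pairs = [(starts[i], starts[i + 1]) for i in range(len(starts) - 1)]
  pairs.append((starts[-1], total))  # final chunk runs up to total
  return pairs

print(split_ranges(73000, 4))  # [(0, 18250), (18250, 36500), (36500, 54750), (54750, 73000)]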

Multi-thread Crawler
An implementation based on a thread pool:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import Queue
import sys
import requests
import os
import threading
import time


class Worker(threading.Thread):  # processes work requests
  def __init__(self, workQueue, resultQueue, **kwds):
    threading.Thread.__init__(self, **kwds)
    self.setDaemon(True)
    self.workQueue = workQueue
    self.resultQueue = resultQueue

  def run(self):
    while 1:
      try:
        callable, args, kwds = self.workQueue.get(False)  # get task
        res = callable(*args, **kwds)
        self.resultQueue.put(res)  # put result into the result queue
      except Queue.Empty:
        break


class WorkManager:  # thread pool manager: creates the worker threads
  def __init__(self, num_of_workers=10):
    self.workQueue = Queue.Queue()    # request queue
    self.resultQueue = Queue.Queue()  # queue for results
    self.workers = []
    self._recruitThreads(num_of_workers)

  def _recruitThreads(self, num_of_workers):
    for i in range(num_of_workers):
      worker = Worker(self.workQueue, self.resultQueue)  # create a worker thread
      self.workers.append(worker)  # add it to the pool

  def start(self):
    for w in self.workers:
      w.start()

  def wait_for_complete(self):
    while len(self.workers):
      worker = self.workers.pop()  # take a thread out of the pool
      worker.join()
      if worker.isAlive() and not self.workQueue.empty():
        self.workers.append(worker)  # put it back
    print 'All jobs were completed in the thread pool.'

  def add_job(self, callable, *args, **kwds):
    self.workQueue.put((callable, args, kwds))  # add the request to the work queue

  def get_result(self, *args, **kwds):
    return self.resultQueue.get(*args, **kwds)


def download_file(url):
  # print 'beg download', url
  requests.get(url).text


def main():
  try:
    num_of_threads = int(sys.argv[1])
  except:
    num_of_threads = 10
  _st = time.time()
  wm = WorkManager(num_of_threads)
  print num_of_threads
  urls = ['http://www.baidu.com'] * 1000
  for i in urls:
    wm.add_job(download_file, i)
  wm.start()
  wm.wait_for_complete()
  print time.time() - _st


if __name__ == '__main__':
  main()
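
The main() above discards the downloaded bodies; anything a job returns is pushed into resultQueue and can be drained with get_result. A rough sketch of that, reusing WorkManager, Queue and requests from the block above (fetch_len is a hypothetical job function, not part of the original code):

# Sketch: collect per-URL results pushed into resultQueue by the workers.
def fetch_len(url):
  return len(requests.get(url).text)  # hypothetical job returning the page size

def collect_demo():
  wm = WorkManager(4)
  for url in ['http://www.baidu.com'] * 8:
    wm.add_job(fetch_len, url)
  wm.start()
  wm.wait_for_complete()
  while True:
    try:
      print(wm.get_result(False))  # non-blocking get: drain whatever results arrived
    except Queue.Empty:
      break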

Any of these three approaches is quite efficient, but running them flat out puts a lot of pressure on the target web server, especially for small sites, so it is better to show some restraint.
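
One simple way to be polite is to lower the concurrency and pause before each request. A minimal sketch, reusing the first AsySpider class above (PoliteSpider and the 0.5-second delay are illustrative choices, not from the original code):

# Sketch: a politer spider -- low concurrency plus a fixed delay per request.
from tornado import gen

class PoliteSpider(AsySpider):
  @gen.coroutine
  def get_page(self, url):
    yield gen.sleep(0.5)  # pause before every request (value is arbitrary)
    result = yield super(PoliteSpider, self).get_page(url)
    raise gen.Return(result)

if __name__ == '__main__':
  urls = ['http://www.baidu.com?page=%s' % p for p in range(1, 20)]
  PoliteSpider(urls, concurrency=2).run()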
