Python multi-thread, asynchronous + multi-process crawler implementation code
Install Tornado
The grequests library can be used directly, but the examples below use Tornado's asynchronous HTTP client instead. The simple asynchronous crawler class is adapted from the example in the official Tornado documentation; see the latest docs for more details.
pip install tornado
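Before moving on to the Tornado version, here is a minimal grequests sketch for comparison. This is only an illustration, not part of the original code; it assumes grequests is installed separately (pip install grequests), and the URLs and concurrency value are placeholders.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Minimal grequests sketch (assumes: pip install grequests); URLs are illustrative.
import grequests

urls = ['http://www.baidu.com?page=%s' % page for page in range(1, 10)]
reqs = (grequests.get(url, timeout=5) for url in urls)
for resp in grequests.map(reqs, size=10):    # size caps the number of concurrent requests
    if resp is not None and resp.status_code == 200:
        print(resp.url)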
Asynchronous Crawler
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback


class AsySpider(object):
    """A simple class of asynchronous spider."""

    def __init__(self, urls, concurrency=10, **kwargs):
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def fetch(self, url, **kwargs):
        fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
        return fetch(url, **kwargs)

    def handle_html(self, url, html):
        """Handle an html page."""
        print(url)

    def handle_response(self, url, response):
        """Inherit and override this method."""
        if response.code == 200:
            self.handle_html(url, response.body)
        elif response.code == 599:    # retry
            self._fetching.remove(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        try:
            response = yield self.fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):

        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)    # handle response
                self._fetched.add(current_url)
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())    # add the first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        assert self._fetching == self._fetched

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


class MySpider(AsySpider):

    def fetch(self, url, **kwargs):
        """Override the parent fetch method to add cookies, headers, timeout and other information."""
        # copy the cookie string from the browser
        cookies_str = ("PHPSESSID=j1tt66a829idnms56ppb70jri4; pspt=...; key=...; "
                       "think_language=zh-cn; "
                       "SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; "
                       "CNZZDATA1254842228=1433864393-1442821331-%7C1444972138")
        headers = {
            'user-agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'cookie': cookies_str
        }
        return super(MySpider, self).fetch(    # for the parameters, see the tornado documentation
            url, headers=headers, request_timeout=1)

    def handle_html(self, url, html):
        print(url, html)


def main():
    urls = []
    for page in range(1, 100):
        urls.append('http://www.baidu.com?page=%s' % page)
    s = MySpider(urls)
    s.run()


if __name__ == '__main__':
    main()
You can inherit this class, pass in your own list of URLs, and override handle_html (or handle_response) to process each fetched page.
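For instance, a minimal subclass sketch (not part of the original code; the filename scheme here is only an illustration) could save each fetched page to disk:

# Minimal subclass sketch: override handle_html to save each page (filename scheme is illustrative).
class SaveSpider(AsySpider):
    def handle_html(self, url, html):
        filename = url.replace('/', '_').replace(':', '_').replace('?', '_')  # crude but filesystem-safe
        with open(filename, 'wb') as f:
            f.write(html)    # response.body is a byte string


if __name__ == '__main__':
    SaveSpider(['http://www.baidu.com?page=%s' % p for p in range(1, 50)]).run()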
Asynchronous + multi-process Crawler
You can also add a process pool with the multiprocessing module, which makes this quite efficient.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
    """A simple class of asynchronous spider."""

    def __init__(self, urls, concurrency):
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def handle_page(self, url, html):
        filename = url.rsplit('/', 1)[1]
        with open(filename, 'w+') as f:
            f.write(html)

    @gen.coroutine
    def get_page(self, url):
        try:
            response = yield httpclient.AsyncHTTPClient().fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return('')
        raise gen.Return(response.body)

    @gen.coroutine
    def _run(self):

        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                html = yield self.get_page(current_url)
                self._fetched.add(current_url)
                self.handle_page(current_url, html)
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        assert self._fetching == self._fetched

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


def run_spider(beg, end):
    urls = []
    for page in range(beg, end):
        urls.append('http://127.0.0.1/%s.htm' % page)
    s = AsySpider(urls, 10)
    s.run()


def main():
    _st = time.time()
    p = Pool()
    all_num = 73000
    num = 4  # number of cpu cores
    per_num, left = divmod(all_num, num)
    s = range(0, all_num, per_num)
    res = []
    for i in range(len(s) - 1):
        res.append((s[i], s[i + 1]))
    res.append((s[len(s) - 1], all_num))
    print(res)
    for i in res:
        p.apply_async(run_spider, args=(i[0], i[1],))
    p.close()
    p.join()
    print(time.time() - _st)


if __name__ == '__main__':
    main()
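To make the work splitting in main() concrete: with all_num = 73000 and num = 4, divmod(73000, 4) gives per_num = 18250 and left = 0, so range(0, 73000, 18250) yields the start points 0, 18250, 36500 and 54750, and the (beg, end) pairs handed to apply_async are (0, 18250), (18250, 36500), (36500, 54750) and (54750, 73000), one block of URLs per worker process.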
Multi-thread Crawler
Thread Pool implementation.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import Queue
import sys
import requests
import os
import threading
import time


class Worker(threading.Thread):    # processes work requests
    def __init__(self, workQueue, resultQueue, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.setDaemon(True)
        self.workQueue = workQueue
        self.resultQueue = resultQueue

    def run(self):
        while 1:
            try:
                callable, args, kwds = self.workQueue.get(False)    # get a task
                res = callable(*args, **kwds)
                self.resultQueue.put(res)    # put the result into the result queue
            except Queue.Empty:
                break


class WorkManager:    # thread pool manager
    def __init__(self, num_of_workers=10):
        self.workQueue = Queue.Queue()      # request queue
        self.resultQueue = Queue.Queue()    # queue for output results
        self.workers = []
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue)    # create a worker thread
            self.workers.append(worker)    # add it to the thread list

    def start(self):
        for w in self.workers:
            w.start()

    def wait_for_complete(self):
        while len(self.workers):
            worker = self.workers.pop()    # take a thread out of the pool
            worker.join()
            if worker.isAlive() and not self.workQueue.empty():
                self.workers.append(worker)    # put it back into the pool
        print 'All jobs were completed in the thread pool.'

    def add_job(self, callable, *args, **kwds):
        self.workQueue.put((callable, args, kwds))    # add a request to the work queue

    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)


def download_file(url):
    # print 'beg download', url
    requests.get(url).text


def main():
    try:
        num_of_threads = int(sys.argv[1])
    except:
        num_of_threads = 10
    _st = time.time()
    wm = WorkManager(num_of_threads)
    print num_of_threads
    urls = ['http://www.baidu.com'] * 1000
    for i in urls:
        wm.add_job(download_file, i)
    wm.start()
    wm.wait_for_complete()
    print time.time() - _st


if __name__ == '__main__':
    main()
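For comparison only (this is not part of the original code), roughly the same thread-pool download can be sketched with the standard library's multiprocessing.dummy, which exposes the Pool interface backed by threads rather than processes:

# Alternative sketch using multiprocessing.dummy (threads behind the Pool API); not the WorkManager above.
from multiprocessing.dummy import Pool as ThreadPool
import requests


def download_file(url):
    return requests.get(url).text


def crawl(urls, num_of_threads=10):
    pool = ThreadPool(num_of_threads)          # pool of worker threads
    results = pool.map(download_file, urls)    # blocks until every download finishes
    pool.close()
    pool.join()
    return results


if __name__ == '__main__':
    crawl(['http://www.baidu.com'] * 100)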
All three approaches are quite efficient, but running them carelessly puts a lot of pressure on the target web server, especially on small sites, so it is best to throttle the crawl a little.