Python multi-threaded, asynchronous + multi-process crawler implementation code

Installing Tornado
It is easy to just use the grequests library directly; what follows instead uses Tornado's asynchronous HTTP client. The asynchronous part is built on Tornado, adapted from an example in the official documentation into a simple asynchronous crawler. You can consult the latest Tornado documentation while studying it.
pip install tornado
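
If you only need asynchronous requests and not a full crawler class, the grequests route mentioned above is the quickest. A minimal sketch, assuming grequests is installed (pip install grequests) and using an illustrative URL list:

import grequests

urls = ['http://www.baidu.com?page=%s' % page for page in range(1, 11)]
reqs = (grequests.get(url, timeout=5) for url in urls)
for resp in grequests.map(reqs):    # map() returns None for requests that failed
    if resp is not None:
        print(resp.url, resp.status_code)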

Asynchronous crawler

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback


class AsySpider(object):
    """A simple asynchronous spider."""

    def __init__(self, urls, concurrency=10, **kwargs):
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def fetch(self, url, **kwargs):
        fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
        return fetch(url, **kwargs)

    def handle_html(self, url, html):
        """Handle an HTML page."""
        print(url)

    def handle_response(self, url, response):
        """Inherit and override this method."""
        if response.code == 200:
            self.handle_html(url, response.body)
        elif response.code == 599:    # retry
            self._fetching.remove(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        try:
            response = yield self.fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)    # handle the response
                self._fetched.add(current_url)
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())    # add the first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        assert self._fetching == self._fetched

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


class MySpider(AsySpider):

    def fetch(self, url, **kwargs):
        """Override the parent fetch method to add cookies, headers, timeout and other information."""
        cookies_str = ("PHPSESSID=J1TT66A829IDNMS56PPB70JRI4; "
                       "pspt=%7b%22id%22%3a%2233153%22%2c%22pswd%22%3a%228835d2c1351d221b4ab016fbf9e8253f%22%2c%22_code%22%3a%22f779dcd011f4e2581c716d1e1b945861%22%7d; "
                       "key=%e9%87%8d%e5%ba%86%e5%95%84%e6%9c%a8%e9%b8%9f%e7%bd%91%e7%bb%9c%e7%a7%91%e6%8a%80%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8; "
                       "THINK_LANGUAGE=ZH-CN; "
                       "serverid=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; "
                       "cnzzdata1254842228=1433864393-1442810831-%7c1444972138")    # copy the cookie string from your browser
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'Cookie': cookies_str
        }
        return super(MySpider, self).fetch(    # for the parameters, see the tornado documentation
            url, headers=headers, request_timeout=1
        )

    def handle_html(self, url, html):
        print(url, html)


def main():
    urls = []
    for page in range(1, 100):    # the page count here is a placeholder; set it as needed
        urls.append('http://www.baidu.com?page=%s' % page)
    s = MySpider(urls)
    s.run()


if __name__ == '__main__':
    main()

You can inherit from this class, pass in a list of URLs, and override handle_html (or handle_response) to process the fetched pages, as the sketch below shows.
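
For example, a minimal subclass could look like the following sketch; AsySpider is the class defined above, while the subclass name and URL list are made up purely for illustration:

class PageSpider(AsySpider):

    def handle_html(self, url, html):
        # replace the default print with your own parsing or storage logic
        print('got %s bytes from %s' % (len(html), url))


if __name__ == '__main__':
    seed_urls = ['http://www.baidu.com?page=%s' % p for p in range(1, 21)]
    PageSpider(seed_urls, concurrency=5).run()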

Asynchronous + multi-process crawler
You can also take this a step further and add a process pool using the multiprocessing module, which raises the throughput yet again.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
    """A simple asynchronous spider."""

    def __init__(self, urls, concurrency):
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()

    def handle_page(self, url, html):
        filename = url.rsplit('/', 1)[1]
        with open(filename, 'wb') as f:    # response bodies are bytes, so write in binary mode
            f.write(html)

    @gen.coroutine
    def get_page(self, url):
        try:
            response = yield httpclient.AsyncHTTPClient().fetch(url)
            print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(b'')
        raise gen.Return(response.body)

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                html = yield self.get_page(current_url)
                self._fetched.add(current_url)
                self.handle_page(current_url, html)
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())    # add the first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        assert self._fetching == self._fetched

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


def run_spider(beg, end):
    urls = []
    for page in range(beg, end):
        urls.append('http://127.0.0.1/%s.htm' % page)
    s = AsySpider(urls, 10)
    s.run()


def main():
    _st = time.time()
    p = Pool()
    all_num = 73000
    num = 4    # number of CPU cores
    per_num, left = divmod(all_num, num)
    s = range(0, all_num, per_num)
    res = []
    for i in range(len(s) - 1):
        res.append((s[i], s[i + 1]))
    res.append((s[len(s) - 1], all_num))
    print(res)
    for i in res:
        p.apply_async(run_spider, args=(i[0], i[1],))
    p.close()
    p.join()
    print(time.time() - _st)


if __name__ == '__main__':
    main()
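
For example, with all_num = 73000 pages and num = 4 processes, divmod(73000, 4) gives per_num = 18250, so the URL space is split into the chunks (0, 18250), (18250, 36500), (36500, 54750) and (54750, 73000); each chunk is handed to its own process, and every process runs its own IOLoop-based AsySpider over its slice.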

Multi-threaded Crawler
Thread pool implementation.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import queue
import sys
import requests
import os
import threading
import time


class Worker(threading.Thread):    # worker thread that processes work requests

    def __init__(self, workQueue, resultQueue, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue

    def run(self):
        while 1:
            try:
                callable, args, kwds = self.workQueue.get(False)    # get a task
                res = callable(*args, **kwds)
                self.resultQueue.put(res)    # store the result
            except queue.Empty:
                break


class WorkManager:    # creates and manages the thread pool

    def __init__(self, num_of_workers=10):
        self.workQueue = queue.Queue()      # request queue
        self.resultQueue = queue.Queue()    # result queue
        self.workers = []
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue)    # create a worker thread
            self.workers.append(worker)                          # add it to the pool

    def start(self):
        for w in self.workers:
            w.start()

    def wait_for_complete(self):
        while len(self.workers):
            worker = self.workers.pop()    # take a thread out of the pool
            worker.join()
            if worker.is_alive() and not self.workQueue.empty():
                self.workers.append(worker)    # put it back into the pool
        print('All jobs are complete.')

    def add_job(self, callable, *args, **kwds):
        self.workQueue.put((callable, args, kwds))    # add a request to the work queue

    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)


def download_file(url):
    # print('beg download', url)
    requests.get(url).text


def main():
    try:
        num_of_threads = int(sys.argv[1])
    except Exception:
        num_of_threads = 10
    _st = time.time()
    wm = WorkManager(num_of_threads)
    print(num_of_threads)
    urls = ['http://www.baidu.com'] * 100    # the repeat count here is only illustrative
    for i in urls:
        wm.add_job(download_file, i)
    wm.start()
    wm.wait_for_complete()
    print(time.time() - _st)


if __name__ == '__main__':
    main()
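
As a side note, the same thread-pool pattern can be expressed with the standard library's concurrent.futures module instead of a hand-rolled Worker/WorkManager pair. The following is only an alternative sketch with an illustrative workload, not the author's code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import time


def download_file(url):
    return requests.get(url, timeout=10).text


def main():
    urls = ['http://www.baidu.com'] * 100    # illustrative workload
    _st = time.time()
    with ThreadPoolExecutor(max_workers=10) as pool:
        futures = [pool.submit(download_file, u) for u in urls]
        for future in as_completed(futures):
            try:
                future.result()    # raises if the download failed
            except Exception as e:
                print('download failed: %s' % e)
    print(time.time() - _st)


if __name__ == '__main__':
    main()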

Any of these three approaches is very efficient, but running them puts considerable pressure on the target web server, especially a small site, so please show some restraint and crawl responsibly.
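
One simple way to reduce that load is to throttle each worker. As a possible tweak (assuming the AsySpider class from the first example and Tornado 4.1+, which provides gen.sleep), you could pause before every fetch:

from tornado import gen

class PoliteSpider(AsySpider):    # hypothetical subclass, for illustration only
    delay = 0.5                   # seconds each worker waits before a request

    @gen.coroutine
    def get_page(self, url):
        yield gen.sleep(self.delay)    # throttle the crawl rate
        result = yield super(PoliteSpider, self).get_page(url)
        raise gen.Return(result)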
