This article introduces how to implement a multi-threaded crawler with a thread pool in both PHP and Python, including a complete working example for each language. It may serve as a useful reference for interested readers.
A multi-threaded crawler can fetch content in parallel, which improves performance. Below we look at thread-pool multi-threaded crawler examples in PHP and Python; the code is as follows:
PHP Example
<?phpclass Connect extends Worker//worker mode {public Function __construct () {}public function getconnection () {if (! Self:: $ch) {self:: $ch = Curl_init (); Curl_setopt (self:: $ch, Curlopt_timeout, 2); curl_setopt (self:: $ch, Curlopt_ Returntransfer, 1); Curl_setopt (self:: $ch, Curlopt_header, 0); curl_setopt (self:: $ch, Curlopt_nosignal, true); Curl_ Setopt (self:: $ch, Curlopt_useragent, ' Firefox '); curl_setopt (self:: $ch, Curlopt_followlocation, 1);} /* Do some exception/error stuff this maybe */return self:: $ch;} Public Function CloseConnection () {Curl_close (self:: $ch);} /*** Note that the link was stored statically, which for pthreads, means thread local* */protected static $ch;} Class Query extends Threaded{public function __construct ($url) {$this->url = $url;} Public Function Run () {$ch = $this->worker->getconnection (); curl_setopt ($ch, Curlopt_url, $this->url); $page = Curl_exec ($ch); $info = Curl_getinfo ($ch); $error = Curl_error ($ch); $this->deal_data ($this->url, $page, $info, $ Error); $this->result = $page;} function Deal_data ($url, $page, $info, $error) {$parts = Explode (".", $url); $id = $parts [1];if ($info [' http_code ']! = 200) { $this->show_msg ($id, $error);} else{$this->show_msg ($id, "OK");}} function show_msg ($id, $msg) {echo $id. 
" \t$msg\n ";} Public Function GetResult () {return $this->result;} protected $url;p rotected $result;} function Check_urls_multi_pthreads () {global $check _urls;//define crawl Connection $check_urls = Array (' http://xxx.com ' = ' xx net '); $pool = new Pool ("Connect", Array ()); Create 10 thread pool foreach ($check _urls as $url + = $name) {$pool->submit (new Query ($url));} $pool->shutdown ();} Check_urls_multi_pthreads ();p Ython multithreaded def handle (SID)://This method performs crawler data processing Passclass MyThread (Thread): "" "DocString for ClassName "" "Def __init__ (self, sid): Thread.__init__ (self) self.sid = Siddef run (): Handle (SELF.SID) threads = []for i in Xrange (1,11): t = MyThread (i) threads.append (t) T.start () for T in Threads:t.join ()
Python thread pool crawler:
From queue import queuefrom threading import Thread, Lockimport urllib.parseimport socketimport reimport timeseen_urls = s ET (['/']) lock = Lock () class Fetcher (Thread): Def __init__ (self, Tasks): thread.__init__ (self) self.tasks = tasks Self.daemon = True Self.start () def run (self): while True:url = Self.tasks.get () print (URL) sock = Socket.socket () sock.connect ((' localhost ', +)) get = ' Get {} http/1.0\r\nhost:localhost\r\n\r\n '. Format (URL ) Sock.send (Get.encode (' ASCII ')) response = B ' chunk = SOCK.RECV (4096) while Chunk:response + = Chunk Chunk = Sock.recv (4096) links = self.parse_links (URL, Response) lock.acquire () for link in L Inks.difference (Seen_urls): Self.tasks.put (link) seen_urls.update (links) lock.release () self.tasks.t Ask_done () def parse_links (self, Fetched_url, response): If not Response:print (' ERROR: {} '. Format (Fetched_url)) return set () ifNot self._is_html (response): Return set () URLs = set (Re.findall r "(? i) href=[" ']? ( [^\s "' <>]+) ', Self.body (response))) links = set () for URLs in urls:normalized = URLLIB.PA Rse.urljoin (Fetched_url, url) parts = urllib.parse.urlparse (normalized) if parts.scheme not in (' ', ' http ', ' htt PS '): Continue host, port = Urllib.parse.splitport (Parts.netloc) if host and Host.lower () not in (' Localh OST '): Continue defragmented, Frag = Urllib.parse.urldefrag (Parts.path) Links.add (defragmented) return Links def body (self, response): BODY = Response.split (b ' \r\n\r\n ', 1) [1] return Body.decode (' Utf-8 ') def _is_html (Self, Response): Head, BODY = Response.split (b ' \r\n\r\n ', 1) headers = dict (H.split (': ') to H in Head.decode (). SP Lit (' \ r \ n ') [1:]) return Headers.get (' Content-type ', '). StartsWith (' text/html ') class Threadpool:def __init__ (self, n Um_threads): Self.tasks = Queue () for _ in range (nUm_threads): Fetcher (self.tasks) def add_task (self, URL): Self.tasks.put (URL) def-wait_completion (self): self . 
Tasks.join () if __name__ = = ' __main__ ': start = time.time () pool = ThreadPool (4) Pool.add_task ("/") Pool.wait_completi On () print (' {} URLs fetched in {:. 1f} seconds '. Format (len (seen_urls), Time.time ()-start))
Summary: that is the entire content of this article; I hope it helps you in your study.