Example of a thread-pool, multi-threaded crawler implemented in PHP and Python.
This example describes thread-pool, multi-threaded crawling implemented in both PHP and Python, shared here for your reference. The details are as follows:
A multi-threaded crawler can fetch content concurrently, which improves throughput. Below are examples of multi-threaded crawlers using thread pools in PHP and in Python. The code is as follows:
PHP example:
<?php
// PHP pthreads thread-pool crawler example.
// Reconstructed from a garbled scrape: restored `__construct`, `self::$ch`,
// `curl_error`, the `http_code` curl_getinfo key, and missing parentheses.

class Connect extends Worker // worker mode
{
    public function __construct() {}

    // Lazily create one shared cURL handle per worker thread and return it.
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }
        /* do some exception/error stuff here maybe */
        return self::$ch;
    }

    public function CloseConnection()
    {
        curl_close(self::$ch);
    }

    /**
     * Note that the link is stored statically, which for pthreads
     * means thread local.
     */
    protected static $ch;
}

class Query extends Threaded
{
    public function __construct($url)
    {
        $this->url = $url;
    }

    // Fetch the URL on the worker's shared handle and record the result.
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page  = curl_exec($ch);
        $info  = curl_getinfo($ch);
        $error = curl_error($ch);   // was garbled as `curl_err Or` in the scrape
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    function deal_data($url, $page, $info, $error)
    {
        $parts = explode(".", $url);
        $id = $parts[1];
        // curl_getinfo() uses the lowercase key 'http_code'.
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    function show_msg($id, $msg)
    {
        echo $id . "\t$msg\n";
    }

    public function getResult()
    {
        return $this->result;
    }

    protected $url;
    protected $result;
}

function check_urls_multi_pthreads()
{
    global $check_urls; // defines the captured connection
    $check_urls = array('http://xxx.com' => "xx network");
    $pool = new Pool(10, "Connect", array()); // create a pool of 10 threads
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();

# ----------------------------------------------------------------------
# Python multi-thread (the article continues in Python 2 below):
# ----------------------------------------------------------------------

def handle(sid):
    # Execute crawler data processing in this method.
    pass

class MyThread(Thread):
    """docstring for ClassName"""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    # BUG FIX: the original wrote `def run():` with no `self` parameter,
    # so every started thread would die with a TypeError.
    def run(self):
        handle(self.sid)

threads = []
for i in xrange(1, 11):   # Python 2 idiom (`xrange`); use range() on Python 3
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()
Python thread pool crawler:
from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time

# Paths already discovered; shared across all fetcher threads, guarded by `lock`.
seen_urls = set(['/'])
lock = Lock()


class Fetcher(Thread):
    """Worker thread: pulls URL paths from the shared task queue, fetches them
    from localhost:3000 over a raw socket, and enqueues newly found links."""

    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True  # don't block interpreter exit
        self.start()

    def run(self):
        while True:
            url = self.tasks.get()
            print(url)
            # Raw HTTP/1.0 GET against the local test server.
            sock = socket.socket()
            sock.connect(('localhost', 3000))
            get = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
            sock.send(get.encode('ascii'))
            response = b''
            chunk = sock.recv(4096)
            while chunk:
                response += chunk
                chunk = sock.recv(4096)
            links = self.parse_links(url, response)
            # Guard the shared seen_urls set while scheduling new work.
            with lock:
                for link in links.difference(seen_urls):
                    self.tasks.put(link)
                seen_urls.update(links)
            self.tasks.task_done()

    def parse_links(self, fetched_url, response):
        """Return the set of same-host (localhost) paths linked from an HTML
        response; empty set for empty or non-HTML responses."""
        if not response:
            print('error: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()
        urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                              self.body(response)))
        links = set()
        for url in urls:
            normalized = urllib.parse.urljoin(fetched_url, url)
            parts = urllib.parse.urlparse(normalized)
            if parts.scheme not in ('', 'http', 'https'):
                continue
            # parts.hostname is already lower-cased; replaces the deprecated
            # urllib.parse.splitport() call of the original.
            host = parts.hostname
            # BUG FIX: the original tested `host.lower() not in ('localhost')`.
            # ('localhost') is a plain *string*, so this was a substring test
            # and any substring of 'localhost' (e.g. 'host') passed the
            # same-host filter. A one-element tuple makes it a membership test.
            if host and host not in ('localhost',):
                continue
            defragmented, frag = urllib.parse.urldefrag(parts.path)
            links.add(defragmented)
        return links

    def body(self, response):
        """Decode and return the body of a raw HTTP response."""
        body = response.split(b'\r\n\r\n', 1)[1]
        return body.decode('utf-8')

    def _is_html(self, response):
        """True when the response's Content-Type header is text/html."""
        head, body = response.split(b'\r\n\r\n', 1)
        headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
        return headers.get('Content-Type', '').startswith('text/html')


class ThreadPool:
    """Fixed-size pool of Fetcher threads sharing one task queue."""

    def __init__(self, num_threads):
        self.tasks = Queue()
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        self.tasks.put(url)

    def wait_completion(self):
        # Blocks until every queued task has been marked done.
        self.tasks.join()


if __name__ == '__main__':
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()
    print('{} URLs fetched in {:.1f} seconds'.format(
        len(seen_urls), time.time() - start))