Example of a thread-pool, multi-threaded crawler implemented in PHP and Python.
This example describes thread-pool, multi-threaded crawling implemented in both PHP and Python, shared here for your reference. The details are as follows:
A multi-threaded crawler can fetch content concurrently, which improves throughput. Below are examples of multi-threaded crawlers using thread pools in PHP and in Python. The code is as follows:
PHP example:
<?php
// PHP pthreads thread-pool crawler example.
// Reconstructed from a garbled scrape: restored `__construct`, `self::$ch`,
// `curl_error`, the `http_code` curl_getinfo key, and missing parentheses.

class Connect extends Worker // worker mode
{
    public function __construct() {}

    // Lazily create one shared cURL handle per worker thread and return it.
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }
        /* do some exception/error stuff here maybe */
        return self::$ch;
    }

    public function CloseConnection()
    {
        curl_close(self::$ch);
    }

    /**
     * Note that the link is stored statically, which for pthreads
     * means thread local.
     */
    protected static $ch;
}

class Query extends Threaded
{
    public function __construct($url)
    {
        $this->url = $url;
    }

    // Fetch the URL on the worker's shared handle and record the result.
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page  = curl_exec($ch);
        $info  = curl_getinfo($ch);
        $error = curl_error($ch);   // was garbled as `curl_err Or` in the scrape
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    function deal_data($url, $page, $info, $error)
    {
        $parts = explode(".", $url);
        $id = $parts[1];
        // curl_getinfo() uses the lowercase key 'http_code'.
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    function show_msg($id, $msg)
    {
        echo $id . "\t$msg\n";
    }

    public function getResult()
    {
        return $this->result;
    }

    protected $url;
    protected $result;
}

function check_urls_multi_pthreads()
{
    global $check_urls; // defines the captured connection
    $check_urls = array('http://xxx.com' => "xx network");
    $pool = new Pool(10, "Connect", array()); // create a pool of 10 threads
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();

# ----------------------------------------------------------------------
# Python multi-thread (the article continues in Python 2 below):
# ----------------------------------------------------------------------

def handle(sid):
    # Execute crawler data processing in this method.
    pass

class MyThread(Thread):
    """docstring for ClassName"""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    # BUG FIX: the original wrote `def run():` with no `self` parameter,
    # so every started thread would die with a TypeError.
    def run(self):
        handle(self.sid)

threads = []
for i in xrange(1, 11):   # Python 2 idiom (`xrange`); use range() on Python 3
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()
Python thread pool crawler:
from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time

# Paths already discovered; shared across all fetcher threads, guarded by `lock`.
seen_urls = set(['/'])
lock = Lock()


class Fetcher(Thread):
    """Worker thread: pulls URL paths from the shared task queue, fetches them
    from localhost:3000 over a raw socket, and enqueues newly found links."""

    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True  # don't block interpreter exit
        self.start()

    def run(self):
        while True:
            url = self.tasks.get()
            print(url)
            # Raw HTTP/1.0 GET against the local test server.
            sock = socket.socket()
            sock.connect(('localhost', 3000))
            get = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
            sock.send(get.encode('ascii'))
            response = b''
            chunk = sock.recv(4096)
            while chunk:
                response += chunk
                chunk = sock.recv(4096)
            links = self.parse_links(url, response)
            # Guard the shared seen_urls set while scheduling new work.
            with lock:
                for link in links.difference(seen_urls):
                    self.tasks.put(link)
                seen_urls.update(links)
            self.tasks.task_done()

    def parse_links(self, fetched_url, response):
        """Return the set of same-host (localhost) paths linked from an HTML
        response; empty set for empty or non-HTML responses."""
        if not response:
            print('error: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()
        urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                              self.body(response)))
        links = set()
        for url in urls:
            normalized = urllib.parse.urljoin(fetched_url, url)
            parts = urllib.parse.urlparse(normalized)
            if parts.scheme not in ('', 'http', 'https'):
                continue
            # parts.hostname is already lower-cased; replaces the deprecated
            # urllib.parse.splitport() call of the original.
            host = parts.hostname
            # BUG FIX: the original tested `host.lower() not in ('localhost')`.
            # ('localhost') is a plain *string*, so this was a substring test
            # and any substring of 'localhost' (e.g. 'host') passed the
            # same-host filter. A one-element tuple makes it a membership test.
            if host and host not in ('localhost',):
                continue
            defragmented, frag = urllib.parse.urldefrag(parts.path)
            links.add(defragmented)
        return links

    def body(self, response):
        """Decode and return the body of a raw HTTP response."""
        body = response.split(b'\r\n\r\n', 1)[1]
        return body.decode('utf-8')

    def _is_html(self, response):
        """True when the response's Content-Type header is text/html."""
        head, body = response.split(b'\r\n\r\n', 1)
        headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
        return headers.get('Content-Type', '').startswith('text/html')


class ThreadPool:
    """Fixed-size pool of Fetcher threads sharing one task queue."""

    def __init__(self, num_threads):
        self.tasks = Queue()
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        self.tasks.put(url)

    def wait_completion(self):
        # Blocks until every queued task has been marked done.
        self.tasks.join()


if __name__ == '__main__':
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()
    print('{} URLs fetched in {:.1f} seconds'.format(
        len(seen_urls), time.time() - start))