Example of a thread pool multi-threaded crawler implemented in PHP and Python



This article demonstrates thread pool based multi-threaded crawling implemented in PHP and Python, shared here for your reference. The details are as follows:

A multi-threaded crawler fetches many pages concurrently, which improves throughput. Below are examples of thread pool crawlers in both PHP and Python. The code is as follows:

PHP example:

<?php
class Connect extends Worker // worker mode
{
  public function __construct() {}

  public function getConnection()
  {
    if (!self::$ch) {
      self::$ch = curl_init();
      curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
      curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
      curl_setopt(self::$ch, CURLOPT_HEADER, 0);
      curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
      curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
      curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
    }
    /* do some exception/error handling here maybe */
    return self::$ch;
  }

  public function CloseConnection()
  {
    curl_close(self::$ch);
  }

  /**
   * Note that the cURL handle is stored statically,
   * which for pthreads means it is thread-local.
   */
  protected static $ch;
}

class Query extends Threaded
{
  public function __construct($url)
  {
    $this->url = $url;
  }

  public function run()
  {
    $ch = $this->worker->getConnection();
    curl_setopt($ch, CURLOPT_URL, $this->url);
    $page = curl_exec($ch);
    $info = curl_getinfo($ch);
    $error = curl_error($ch);
    $this->deal_data($this->url, $page, $info, $error);
    $this->result = $page;
  }

  function deal_data($url, $page, $info, $error)
  {
    $parts = explode(".", $url);
    $id = $parts[1];
    if ($info['http_code'] != 200) {
      $this->show_msg($id, $error);
    } else {
      $this->show_msg($id, "OK");
    }
  }

  function show_msg($id, $msg)
  {
    echo $id . "\t$msg\n";
  }

  public function getResult()
  {
    return $this->result;
  }

  protected $url;
  protected $result;
}

function check_urls_multi_pthreads()
{
  global $check_urls; // the URLs to crawl
  $check_urls = array('http://xxx.com' => "xx network");
  $pool = new Pool(10, "Connect", array()); // create a pool of 10 worker threads
  foreach ($check_urls as $url => $name) {
    $pool->submit(new Query($url));
  }
  $pool->shutdown();
}

check_urls_multi_pthreads();

Python multi-threaded example:

from threading import Thread

def handle(sid):
  # execute the crawling/data processing for this task here
  pass

class MyThread(Thread):
  """docstring for ClassName"""
  def __init__(self, sid):
    Thread.__init__(self)
    self.sid = sid

  def run(self):
    handle(self.sid)

threads = []
for i in range(1, 11):  # start ten worker threads
  t = MyThread(i)
  threads.append(t)
  t.start()

for t in threads:
  t.join()
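The handle() function above is only a stub. As a minimal sketch of what it might do, assuming Python 3 and a hypothetical URL pattern (neither is specified in the original), each worker could fetch one page and report the result:

import urllib.request

def handle(sid):
  # Hypothetical URL pattern, for illustration only.
  url = 'http://example.com/page/{}'.format(sid)
  try:
    # Fetch the page and report its size.
    with urllib.request.urlopen(url, timeout=5) as resp:
      page = resp.read()
    print('{}\tOK ({} bytes)'.format(sid, len(page)))
  except Exception as e:
    print('{}\t{}'.format(sid, e))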

Python thread pool crawler:

from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time

seen_urls = set(['/'])
lock = Lock()

class Fetcher(Thread):
  def __init__(self, tasks):
    Thread.__init__(self)
    self.tasks = tasks
    self.daemon = True
    self.start()

  def run(self):
    while True:
      url = self.tasks.get()
      print(url)
      # Fetch the URL from a test HTTP server on localhost:3000.
      sock = socket.socket()
      sock.connect(('localhost', 3000))
      get = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
      sock.send(get.encode('ascii'))
      response = b''
      chunk = sock.recv(4096)
      while chunk:
        response += chunk
        chunk = sock.recv(4096)
      links = self.parse_links(url, response)
      # Enqueue only links we have not seen before.
      lock.acquire()
      for link in links.difference(seen_urls):
        self.tasks.put(link)
      seen_urls.update(links)
      lock.release()
      self.tasks.task_done()

  def parse_links(self, fetched_url, response):
    if not response:
      print('error: {}'.format(fetched_url))
      return set()
    if not self._is_html(response):
      return set()
    urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
               self.body(response)))
    links = set()
    for url in urls:
      normalized = urllib.parse.urljoin(fetched_url, url)
      parts = urllib.parse.urlparse(normalized)
      if parts.scheme not in ('', 'http', 'https'):
        continue
      # splitport is deprecated; parts.hostname/parts.port also work.
      host, port = urllib.parse.splitport(parts.netloc)
      if host and host.lower() not in ('localhost',):  # only crawl the local test host
        continue
      defragmented, frag = urllib.parse.urldefrag(parts.path)
      links.add(defragmented)
    return links

  def body(self, response):
    body = response.split(b'\r\n\r\n', 1)[1]
    return body.decode('utf-8')

  def _is_html(self, response):
    head, body = response.split(b'\r\n\r\n', 1)
    headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
    return headers.get('Content-Type', '').startswith('text/html')

class ThreadPool:
  def __init__(self, num_threads):
    self.tasks = Queue()
    for _ in range(num_threads):
      Fetcher(self.tasks)

  def add_task(self, url):
    self.tasks.put(url)

  def wait_completion(self):
    self.tasks.join()

if __name__ == '__main__':
  start = time.time()
  pool = ThreadPool(4)
  pool.add_task("/")
  pool.wait_completion()
  print('{} URLs fetched in {:.1f} seconds'.format(
    len(seen_urls), time.time() - start))
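The crawler above assumes a test HTTP server listening on localhost:3000 and crawls only that host. For simpler fetch-a-list-of-URLs jobs, the standard library's concurrent.futures.ThreadPoolExecutor provides a ready-made thread pool; below is a minimal sketch of the same idea (the URL list and the fetch() helper are illustrative assumptions, not part of the original):

from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.request

# Placeholder URL list, for illustration only.
urls = ['http://example.com/', 'http://example.org/', 'http://example.net/']

def fetch(url):
  # Fetch one page; exceptions propagate to the caller via the future.
  with urllib.request.urlopen(url, timeout=5) as resp:
    return url, len(resp.read())

with ThreadPoolExecutor(max_workers=4) as pool:
  futures = [pool.submit(fetch, u) for u in urls]
  for future in as_completed(futures):
    try:
      url, size = future.result()
      print('{}\t{} bytes'.format(url, size))
    except Exception as e:
      print('error: {}'.format(e))

The executor takes care of worker startup, queuing, and shutdown, though unlike the Fetcher above it does not rediscover and enqueue new links on its own.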
