PHP example: a multi-threaded URL checker built on the pthreads extension and cURL.
<?php
class Connect extends Worker // worker mode
{
    /**
     * The cURL handle is stored statically, which to pthreads means
     * thread-local: each worker thread keeps its own persistent handle.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Lazily create and return this worker's cURL handle.
     *
     * PHP constants are case-sensitive, so the CURLOPT_* names must be
     * written in upper case (the original text had them mangled).
     *
     * @return resource|\CurlHandle the shared per-thread cURL handle
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            // NOSIGNAL is required for sub-second timeouts in threaded use.
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }
        /* Do some exception/error stuff maybe */
        return self::$ch;
    }

    /**
     * Release this worker's cURL handle.
     */
    public function closeConnection()
    {
        curl_close(self::$ch);
    }
}
class Query extends Threaded
{
    protected $url;
    protected $result;

    /**
     * @param string $url the URL this task will fetch
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Executed inside a worker thread: fetch the URL over the worker's
     * shared cURL handle, report the outcome, and store the page body.
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Inspect the fetch result and print a one-line status report.
     *
     * Note: curl_getinfo() keys are lower case ('http_code'); the original
     * text had the key mangled, which would always read null.
     *
     * @param string $url   the URL that was fetched
     * @param mixed  $page  response body, or false on failure
     * @param array  $info  curl_getinfo() result
     * @param string $error curl_error() text ('' when no error)
     */
    public function deal_data($url, $page, $info, $error)
    {
        // Crude id extraction: second dot-separated segment of the URL.
        $parts = explode(".", $url);
        $id = $parts[1];
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    /**
     * Print "id<TAB>message" followed by a newline.
     */
    public function show_msg($id, $msg)
    {
        echo $id . "\t$msg\n";
    }

    /**
     * @return mixed the fetched page body (false if the fetch failed)
     */
    public function getResult()
    {
        return $this->result;
    }
}
/**
 * Submit one Query task per URL to a pool of Connect workers and wait
 * for the pool to drain.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // the URL => name map to crawl
    $check_urls = array('http://xxx.com' => "xx net");
    // Pool's first argument is the pool size (int), second the Worker
    // class name — the original text had the arguments reversed.
    $pool = new Pool(10, 'Connect'); // create a pool of 10 worker threads
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();
Python multithreading example: one thread per task id.
def handle(sid):
    # Execute the crawler's data processing for this id inside this function.
    pass


class MyThread(Thread):
    """Worker thread that processes a single sid via handle()."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        # run() takes self — the original text dropped the parameter.
        handle(self.sid)


# Start ten workers (ids 1..10), then wait for all of them to finish.
threads = []
for i in range(1, 11):
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()
Python thread-pool crawler example: a fixed pool of fetcher threads draining a shared queue.
import re
import socket
import time
import urllib.parse
from queue import Queue
from threading import Lock, Thread
# URLs already discovered; seeded with the root path. Guarded by `lock`.
seen_urls = set(['/'])
lock = Lock()
class Fetcher(Thread):
    """Daemon worker that fetches pages from localhost:3000, extracts
    same-host links, and feeds newly seen ones back into the task queue.

    The thread starts itself from __init__ and loops forever; shutdown
    relies on it being a daemon thread.
    """

    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            url = self.tasks.get()
            print(url)
            # Raw HTTP/1.0 request over a plain socket; HTTP/1.0 closes
            # the connection after the response, so recv() until empty.
            sock = socket.socket()
            sock.connect(('localhost', 3000))
            request = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
            sock.send(request.encode('ascii'))
            response = b''
            chunk = sock.recv(4096)
            while chunk:
                response += chunk
                chunk = sock.recv(4096)
            links = self.parse_links(url, response)
            # seen_urls is shared by all fetchers; mutate under the lock.
            lock.acquire()
            for link in links.difference(seen_urls):
                self.tasks.put(link)
            seen_urls.update(links)
            lock.release()
            self.tasks.task_done()

    def parse_links(self, fetched_url, response):
        """Return the set of same-host, defragmented link paths found in
        an HTML response; empty set for empty or non-HTML responses.
        """
        if not response:
            print('Error: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()
        urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                              self.body(response)))
        links = set()
        for url in urls:
            normalized = urllib.parse.urljoin(fetched_url, url)
            parts = urllib.parse.urlparse(normalized)
            if parts.scheme not in ('', 'http', 'https'):
                continue
            # urlparse gives the host directly; the deprecated splitport
            # helper is no longer needed. Keep only localhost links —
            # note the tuple: ('localhost') would be a plain string and
            # turn the check into a substring test.
            host = parts.hostname
            if host and host.lower() not in ('localhost',):
                continue
            defragmented, frag = urllib.parse.urldefrag(parts.path)
            links.add(defragmented)
        return links

    def body(self, response):
        # Everything after the first blank line is the entity body.
        body = response.split(b'\r\n\r\n', 1)[1]
        return body.decode('utf-8')

    def _is_html(self, response):
        # Parse "Name: value" header lines (skip the status line);
        # maxsplit=1 so values containing ': ' don't break dict().
        head, body = response.split(b'\r\n\r\n', 1)
        headers = dict(h.split(': ', 1)
                       for h in head.decode().split('\r\n')[1:])
        return headers.get('Content-Type', '').startswith('text/html')
class ThreadPool:
    """Fixed-size pool of Fetcher threads sharing one task queue."""

    def __init__(self, num_threads):
        self.tasks = Queue()
        # Fetcher starts itself, so constructing it is enough.
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Enqueue a URL for some fetcher to pick up."""
        self.tasks.put(url)

    def wait_completion(self):
        """Block until every enqueued task has been marked done."""
        self.tasks.join()
if __name__ == '__main__':
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task('/')
    pool.wait_completion()
    # Original text was missing the closing parenthesis on this call.
    print('{} URLs fetched in {:.1f} seconds'.format(
        len(seen_urls), time.time() - start))