PHP example: a multi-threaded URL checker built on the pthreads extension and cURL.
<?php
class Connect extends Worker // worker mode
{
    /**
     * The cURL handle is stored statically, which to pthreads means
     * thread-local: each worker thread keeps its own persistent handle.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Lazily create and return this worker's cURL handle.
     *
     * PHP constants are case-sensitive, so the CURLOPT_* names must be
     * written in upper case (the original text had them mangled).
     *
     * @return resource|\CurlHandle the shared per-thread cURL handle
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            // NOSIGNAL is required for sub-second timeouts in threaded use.
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }
        /* Do some exception/error stuff maybe */
        return self::$ch;
    }

    /**
     * Release this worker's cURL handle.
     */
    public function closeConnection()
    {
        curl_close(self::$ch);
    }
}
class Query extends Threaded
{
    protected $url;
    protected $result;

    /**
     * @param string $url the URL this task will fetch
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Executed inside a worker thread: fetch the URL over the worker's
     * shared cURL handle, report the outcome, and store the page body.
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Inspect the fetch result and print a one-line status report.
     *
     * Note: curl_getinfo() keys are lower case ('http_code'); the original
     * text had the key mangled, which would always read null.
     *
     * @param string $url   the URL that was fetched
     * @param mixed  $page  response body, or false on failure
     * @param array  $info  curl_getinfo() result
     * @param string $error curl_error() text ('' when no error)
     */
    public function deal_data($url, $page, $info, $error)
    {
        // Crude id extraction: second dot-separated segment of the URL.
        $parts = explode(".", $url);
        $id = $parts[1];
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    /**
     * Print "id<TAB>message" followed by a newline.
     */
    public function show_msg($id, $msg)
    {
        echo $id . "\t$msg\n";
    }

    /**
     * @return mixed the fetched page body (false if the fetch failed)
     */
    public function getResult()
    {
        return $this->result;
    }
}
/**
 * Submit one Query task per URL to a pool of Connect workers and wait
 * for the pool to drain.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // the URL => name map to crawl
    $check_urls = array('http://xxx.com' => "xx net");
    // Pool's first argument is the pool size (int), second the Worker
    // class name — the original text had the arguments reversed.
    $pool = new Pool(10, 'Connect'); // create a pool of 10 worker threads
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();
Python multithreading example: one thread per task id.
def handle(sid):
    # Execute the crawler's data processing for this id inside this function.
    pass


class MyThread(Thread):
    """Worker thread that processes a single sid via handle()."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        # run() takes self — the original text dropped the parameter.
        handle(self.sid)


# Start ten workers (ids 1..10), then wait for all of them to finish.
threads = []
for i in range(1, 11):
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()
Python thread-pool crawler example: a fixed pool of fetcher threads draining a shared queue.
import re
import socket
import time
import urllib.parse
from queue import Queue
from threading import Lock, Thread
# URLs already discovered; seeded with the root path. Guarded by `lock`.
seen_urls = set(['/'])
lock = Lock()
class Fetcher(Thread):
    """Daemon worker that fetches pages from localhost:3000, extracts
    same-host links, and feeds newly seen ones back into the task queue.

    The thread starts itself from __init__ and loops forever; shutdown
    relies on it being a daemon thread.
    """

    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            url = self.tasks.get()
            print(url)
            # Raw HTTP/1.0 request over a plain socket; HTTP/1.0 closes
            # the connection after the response, so recv() until empty.
            sock = socket.socket()
            sock.connect(('localhost', 3000))
            request = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
            sock.send(request.encode('ascii'))
            response = b''
            chunk = sock.recv(4096)
            while chunk:
                response += chunk
                chunk = sock.recv(4096)
            links = self.parse_links(url, response)
            # seen_urls is shared by all fetchers; mutate under the lock.
            lock.acquire()
            for link in links.difference(seen_urls):
                self.tasks.put(link)
            seen_urls.update(links)
            lock.release()
            self.tasks.task_done()

    def parse_links(self, fetched_url, response):
        """Return the set of same-host, defragmented link paths found in
        an HTML response; empty set for empty or non-HTML responses.
        """
        if not response:
            print('Error: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()
        urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                              self.body(response)))
        links = set()
        for url in urls:
            normalized = urllib.parse.urljoin(fetched_url, url)
            parts = urllib.parse.urlparse(normalized)
            if parts.scheme not in ('', 'http', 'https'):
                continue
            # urlparse gives the host directly; the deprecated splitport
            # helper is no longer needed. Keep only localhost links —
            # note the tuple: ('localhost') would be a plain string and
            # turn the check into a substring test.
            host = parts.hostname
            if host and host.lower() not in ('localhost',):
                continue
            defragmented, frag = urllib.parse.urldefrag(parts.path)
            links.add(defragmented)
        return links

    def body(self, response):
        # Everything after the first blank line is the entity body.
        body = response.split(b'\r\n\r\n', 1)[1]
        return body.decode('utf-8')

    def _is_html(self, response):
        # Parse "Name: value" header lines (skip the status line);
        # maxsplit=1 so values containing ': ' don't break dict().
        head, body = response.split(b'\r\n\r\n', 1)
        headers = dict(h.split(': ', 1)
                       for h in head.decode().split('\r\n')[1:])
        return headers.get('Content-Type', '').startswith('text/html')
class ThreadPool:
    """Fixed-size pool of Fetcher threads sharing one task queue."""

    def __init__(self, num_threads):
        self.tasks = Queue()
        # Fetcher starts itself, so constructing it is enough.
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Enqueue a URL for some fetcher to pick up."""
        self.tasks.put(url)

    def wait_completion(self):
        """Block until every enqueued task has been marked done."""
        self.tasks.join()
if __name__ == '__main__':
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task('/')
    pool.wait_completion()
    # Original text was missing the closing parenthesis on this call.
    print('{} URLs fetched in {:.1f} seconds'.format(
        len(seen_urls), time.time() - start))