PHP and Python thread pool multithreaded crawler example

Source: Internet
Author: User
Tags: PHP, Python, thread pool, crawler, socket

PHP Example

<?php

class Connect extends Worker // worker mode: one Connect instance per pool thread
{
    /**
     * The cURL handle is stored statically, which to pthreads means
     * thread-local: each worker thread lazily creates and reuses its own.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Lazily create and return this worker thread's shared cURL handle.
     *
     * @return resource cURL handle
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            // NOSIGNAL avoids SIGALRM issues with sub-second timeouts in threads
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }

        /* Do some exception/error handling here maybe */

        return self::$ch;
    }

    /**
     * Close this worker thread's cURL handle.
     */
    public function closeConnection()
    {
        curl_close(self::$ch);
    }
}

class Query extends Threaded
{
    protected $url;
    protected $result;

    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Executed on a pool worker: fetch the URL with the worker's
     * thread-local cURL handle and record the result.
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page  = curl_exec($ch);
        $info  = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);

        $this->result = $page;
    }

    /**
     * Report success or the cURL error for this fetch.
     * The "id" is the segment after the first dot of the URL
     * (e.g. "com" for "http://xxx.com") — crude, but matches the demo data.
     */
    public function deal_data($url, $page, $info, $error)
    {
        $parts = explode(".", $url);

        $id = $parts[1];
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    public function show_msg($id, $msg)
    {
        echo $id . "\t$msg\n";
    }

    /**
     * @return string|false the fetched page body, or false on failure
     */
    public function getResult()
    {
        return $this->result;
    }
}

/**
 * Crawl each configured URL on a pool of pthreads workers.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // url => human-readable name of each site to crawl
    $check_urls = array('http://xxx.com' => "xx net");
    // Pool's first argument is the pool size; the original comment said
    // "10 thread pools" but the size argument was missing entirely.
    $pool = new Pool(10, 'Connect');
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();


Python multithreading


from threading import Thread


def handle(sid):
    """Do the crawling / data processing for worker *sid* here."""
    pass


class MyThread(Thread):
    """One crawler worker thread identified by an integer sid."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    # Original was `def run():` — missing `self`, so every start() would
    # raise TypeError. Also replaced Py2 `xrange` with `range` to match
    # the Python 3 code elsewhere in this article.
    def run(self):
        handle(self.sid)


threads = []
for i in range(1, 11):
    t = MyThread(i)
    threads.append(t)
    t.start()

for t in threads:
    t.join()


Python thread pool crawler


from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time

# URLs already fetched or queued; the root path seeds the crawl.
seen_urls = {'/'}
# Guards seen_urls and task re-queuing across fetcher threads.
# (Original bound the instance to the name `Lock`, shadowing the class.)
lock = Lock()


class Fetcher(Thread):
    """Daemon worker thread: pulls URLs off the shared task queue, fetches
    them over a raw socket from localhost:3000, and enqueues newly
    discovered same-host links."""

    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True  # don't block interpreter exit on idle workers

        self.start()

    def run(self):
        while True:
            url = self.tasks.get()
            print(url)
            sock = socket.socket()
            sock.connect(('localhost', 3000))
            # Original request line was mangled; HTTP needs
            # "GET <path> HTTP/1.0" and a "Host:" header verbatim.
            request = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
            sock.send(request.encode('ascii'))
            response = b''
            chunk = sock.recv(4096)
            while chunk:
                response += chunk
                chunk = sock.recv(4096)

            links = self.parse_links(url, response)

            lock.acquire()
            # Only enqueue links we have not seen before.
            for link in links.difference(seen_urls):
                self.tasks.put(link)
            seen_urls.update(links)
            lock.release()

            self.tasks.task_done()

    def parse_links(self, fetched_url, response):
        """Return the set of defragmented local links found in *response*."""
        if not response:
            print('ERROR: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()
        urls = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                              self.body(response)))

        links = set()
        for url in urls:
            normalized = urllib.parse.urljoin(fetched_url, url)
            parts = urllib.parse.urlparse(normalized)
            if parts.scheme not in ('', 'http', 'https'):
                continue
            # Original used deprecated urllib.parse.splitport and compared
            # against the *string* ('localhost') — a substring test, not a
            # tuple membership test.
            host = parts.hostname
            if host and host.lower() not in ('localhost', '127.0.0.1'):
                continue
            defragmented, frag = urllib.parse.urldefrag(parts.path)
            links.add(defragmented)

        return links

    def body(self, response):
        """Decode and return the response body (bytes after the headers)."""
        body = response.split(b'\r\n\r\n', 1)[1]
        return body.decode('utf-8')

    def _is_html(self, response):
        """True when the Content-Type header says text/html."""
        head, body = response.split(b'\r\n\r\n', 1)
        headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
        return headers.get('Content-Type', '').startswith('text/html')


class ThreadPool:
    """Fixed-size pool of Fetcher threads consuming one shared task queue."""

    def __init__(self, num_threads):
        self.tasks = Queue()
        # Each Fetcher starts itself in __init__ and begins polling the queue.
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Enqueue *url* for fetching."""
        self.tasks.put(url)

    def wait_completion(self):
        """Block until every queued task has been marked done."""
        self.tasks.join()

if __name__ == '__main__':
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()
    # Original final print was missing its closing parenthesis.
    print('{} URLs fetched in {:.1f} seconds'.format(
        len(seen_urls), time.time() - start))

Contact Us

The content source of this page is from the Internet, and doesn't represent Alibaba Cloud's opinion; products and services mentioned on this page don't have any relationship with Alibaba Cloud. If the content of this page is confusing, please write us an email; we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.