Python Multithreaded Proxy Server Crawler Example


Python, as a powerful scripting language, is often used to write crawlers. The following is a Python crawler that scrapes proxy servers from http://www.proxy.com.ru, a site that lists a large number of proxies and is simple to parse. The code below can be used directly:

#!/usr/bin/env python
#coding: utf-8
#BLOG: blog.linuxeye.com

import urllib2
import re
import threading
import time
import MySQLdb

rawProxyList = []
checkedProxyList = []

# Build the list of target pages (the site paginates its proxy list)
targets = []
for i in xrange(1, 42):
    target = r"http://www.proxy.com.ru/list_%d.html" % i
    targets.append(target)

# Regex matching one proxy table row: (id, ip, port, ..., address)
p = re.compile(r'''<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>''')

# Thread class that fetches the proxies listed on one target page
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print "Target website: " + self.target
        req = urllib2.urlopen(self.target)
        result = req.read()
        matchs = p.findall(result)
        for row in matchs:
            ip = row[1]
            port = row[2]
            # The page is GBK-encoded; re-encode the location string as UTF-8
            addr = row[4].decode("cp936").encode("utf-8")
            proxy = [ip, port, addr]
            print proxy
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()

# Thread class that validates one slice of the crawled proxies
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"  # string expected on the Baidu homepage (its ICP licence number)

    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                result = req.read()
                timeUsed = time.time() - t1
                # Keep the proxy only if the test string is present,
                # i.e. the proxy actually returned the real page
                pos = result.find(self.testStr)
                if pos > 1:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeUsed))
            except Exception:
                continue

    def run(self):
        self.checkProxy()

if __name__ == "__main__":
    getThreads = []
    checkThreads = []

    # One thread per target page, each responsible for crawling its proxies
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)

    for i in range(len(getThreads)):
        getThreads[i].start()

    for i in range(len(getThreads)):
        getThreads[i].join()

    print '.'*10 + "A total of %s proxies were crawled" % len(rawProxyList) + '.'*10

    # Start 20 checker threads: split the crawled list into 20 slices,
    # one per thread ((n+19)/20 is n/20 rounded up in integer division)
    for i in range(20):
        chunk = (len(rawProxyList) + 19) / 20
        t = ProxyCheck(rawProxyList[chunk * i:chunk * (i + 1)])
        checkThreads.append(t)

    for i in range(len(checkThreads)):
        checkThreads[i].start()

    for i in range(len(checkThreads)):
        checkThreads[i].join()

    print '.'*10 + "A total of %s proxies passed validation" % len(checkedProxyList) + '.'*10

    # Insert into the database; create the table yourself, with the four
    # fields ip, port, speed, address (see the schema sketch below)
    def db_insert(insert_list):
        try:
            conn = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="m_common", charset='utf8')
            cursor = conn.cursor()
            cursor.execute('delete from proxy')
            cursor.execute('alter table proxy auto_increment=1')
            cursor.executemany("insert into proxy(ip, port, speed, address) values(%s, %s, %s, %s)", insert_list)
            conn.commit()
            cursor.close()
            conn.close()
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])

    # Sort the validated proxies by response time and persist them
    proxy_ok = []
    f = open("proxy_list.txt", 'w+')
    for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])):
        if proxy[3] < 8:  # discard proxies slower than 8 seconds
            proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))
            f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
    f.close()
    db_insert(proxy_ok)
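
The script assumes the proxy table already exists. A possible schema, inferred from the db_insert() call and the query output shown at the end of this article; the exact column types and the create_time default are assumptions:

create table m_common.proxy (
    proxy_id    int unsigned not null auto_increment primary key,
    ip          varchar(15)  not null,   -- dotted-quad IPv4 address
    port        varchar(5)   not null,   -- the script inserts ports as strings
    speed       float        not null,   -- response time of the test request, in seconds
    address     varchar(128) not null,   -- ISP/location string
    create_time timestamp not null default current_timestamp   -- assumed default
) default charset=utf8;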

The test results are as follows:

# python -V
Python 2.6.6
# python proxy.py
['61.58.94.179', '8088', '\xe5\x8f\xb0\xe6\xb9\xbe\xe7\x9c\x81 \xe5\x8f\xb0\xe6\xb9\xbe\xe5\xae\xbd\xe9\xa2\x91\xe9\x80\x9a\xe8\xae\xaf\xe9\xa1\xbe\xe9\x97\xae\xe8\x82\xa1\xe4\xbb\xbd\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\xe5\x8f\xb8']
['200.84.116.99', '9064', '\xe5\xa7\x94\xe5\x86\x85\xe7\x91\x9e\xe6\x8b\x89']
['183.223.204.8', '8123', '\xe5\x9b\x9b\xe5\xb7\x9d\xe7\x9c\x81\xe8\x87\xaa\xe8\xb4\xa1\xe5\xb8\x82 \xe7\xa7\xbb\xe5\x8a\xa8']
(The third field is the proxy's location, printed as UTF-8 byte escapes.)
..........A total of 1921 proxies were crawled..........
..........A total of 524 proxies passed validation..........
# more proxy_list.txt
202.106.169.142:80 Beijing Unicom ADSL 0.291432857513
111.13.136.59:80 Beijing Mobile 0.297957897186
111.13.136.56:80 Beijing Mobile 0.373070955276
111.206.81.248:80 Beijing Unicom 0.403017997742
111.13.136.58:80 Beijing Mobile 0.414332151413
124.202.217.134:8118 Beijing Telecom Tong 0.416817903519
124.202.183.218:8118 Beijing Telecom Tong 0.426618099213
120.132.71.232:80 Beijing Unicom 0.440200090408
61.232.6.164:8081 Beijing railcom 0.469615936279
118.144.96.253:80 Beijing Telecom Tong 0.485229969025
203.192.10.66:80 Beijing Xinhua News agency 0.51485991478
124.202.182.22:8118 Beijing Telecom Tong 0.553130865097
Database:

mysql> select * from m_common.proxy limit 10;

+----------+-----------------+------+----------+----------------------+---------------------+
| proxy_id | ip              | port | speed    | address              | create_time         |
+----------+-----------------+------+----------+----------------------+---------------------+
|        1 | 202.106.169.142 |   80 | 0.291433 | Beijing Unicom ADSL  | 2015-02-26 11:29:24 |
|        2 | 111.13.136.59   |   80 | 0.297958 | Beijing Mobile       | 2015-02-26 11:29:24 |
|        3 | 111.13.136.56   |   80 | 0.373071 | Beijing Mobile       | 2015-02-26 11:29:24 |
|        4 | 111.206.81.248  |   80 | 0.403018 | Beijing Unicom       | 2015-02-26 11:29:24 |
|        5 | 111.13.136.58   |   80 | 0.414332 | Beijing Mobile       | 2015-02-26 11:29:24 |
|        6 | 124.202.217.134 | 8118 | 0.416818 | Beijing Telecom Tong | 2015-02-26 11:29:24 |
|        7 | 124.202.183.218 | 8118 | 0.426618 | Beijing Telecom Tong | 2015-02-26 11:29:24 |
|        8 | 120.132.71.232  |   80 |   0.4402 | Beijing Unicom       | 2015-02-26 11:29:24 |
|        9 | 61.232.6.164    | 8081 | 0.469616 | Beijing railcom      | 2015-02-26 11:29:24 |
|       10 | 118.144.96.253  |   80 |  0.48523 | Beijing Telecom Tong | 2015-02-26 11:29:24 |
+----------+-----------------+------+----------+----------------------+---------------------+
10 rows in set (0.00 sec)
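
To actually use one of the validated proxies, read the first (and therefore fastest) line of proxy_list.txt and route a request through it. A minimal sketch, reusing the same urllib2 API as the crawler above; the target URL is arbitrary:

#!/usr/bin/env python
#coding: utf-8
import urllib2

# proxy_list.txt is sorted by speed; each line is "ip:port<TAB>address<TAB>speed"
f = open("proxy_list.txt")
fastest = f.readline().split("\t")[0]  # e.g. "202.106.169.142:80"
f.close()

opener = urllib2.build_opener(urllib2.ProxyHandler({"http": "http://" + fastest}))
print opener.open("http://www.baidu.com/", timeout=5).read()[:200]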
