Python, as a powerful scripting language, is often used to write web crawlers. The following is a Python crawler that scrapes proxy servers.
It crawls http://www.proxy.com.ru, a site that lists many proxy servers and is simple to parse.
The following code can be used directly; the code is as follows:
#!/usr/bin/env python
# coding: utf-8
# BLOG: blog.linuxeye.com
"""Scrape proxy servers from www.proxy.com.ru, validate them, and persist the results."""
import re
import threading
import time
import urllib.request

import MySQLdb  # third-party driver (mysqlclient); used by db_insert()

# Shared state appended to by the worker threads below.
rawproxylist = []      # [ip, port, address] entries scraped from the site
checkedproxylist = []  # (ip, port, address, response_time) entries that passed the check
# Proxy listing pages to scrape: list_1.html .. list_41.html.
targets = [r"http://www.proxy.com.ru/list_%d.html" % i for i in range(1, 42)]

# Regex for one proxy table row; groups are (row_id, ip, port, type, address).
p = re.compile(r'<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>')
# Thread that scrapes one listing page and appends raw proxies to `rawproxylist`.
class ProxyGet(threading.Thread):
    """Fetch one proxy listing page and collect [ip, port, address] entries."""

    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target  # URL of the listing page to scrape

    def getproxy(self):
        """Download self.target, extract proxies with the module regex `p`."""
        print("Proxy target page: %s" % self.target)
        req = urllib.request.urlopen(self.target)
        # The site serves Chinese text in cp936 (GBK); decode once so the
        # str regex can match and the address column is already text.
        result = req.read().decode("cp936", errors="replace")
        matches = p.findall(result)
        for row in matches:
            ip = row[1]
            port = row[2]
            addr = row[4]
            proxy = [ip, port, addr]
            print(proxy)
            # list.append is thread-safe under the GIL, so no lock is needed
            rawproxylist.append(proxy)

    def run(self):
        self.getproxy()
# Thread that validates a slice of the raw proxies against a known page.
class ProxyCheck(threading.Thread):
    """Check each proxy in `proxylist` by fetching a test URL through it.

    Working proxies are appended to the module-level `checkedproxylist`
    as (ip, port, address, response_time) tuples.
    """

    def __init__(self, proxylist):
        threading.Thread.__init__(self)
        self.proxylist = proxylist
        self.timeout = 5                         # per-request timeout in seconds
        self.testurl = "http://www.baidu.com/"   # page fetched through each proxy
        self.teststr = "030173"                  # marker expected in the page body

    def checkproxy(self):
        """Try every proxy; keep the ones that return the marker in time."""
        cookies = urllib.request.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxyhandler = urllib.request.ProxyHandler(
                {"http": r"http://%s:%s" % (proxy[0], proxy[1])})
            opener = urllib.request.build_opener(cookies, proxyhandler)
            opener.addheaders = [('User-agent',
                                  'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) '
                                  'Gecko/20100101 Firefox/22.0')]
            t1 = time.time()
            try:
                req = opener.open(self.testurl, timeout=self.timeout)
                result = req.read().decode("utf-8", errors="replace")
                timeused = time.time() - t1
                pos = result.find(self.teststr)
                if pos > 1:
                    checkedproxylist.append((proxy[0], proxy[1], proxy[2], timeused))
            except Exception:
                # Best-effort by design: a dead/slow proxy is simply skipped.
                continue

    def run(self):
        self.checkproxy()
if __name__ == "__main__":
    getthreads = []
    checkthreads = []
    # One thread per target page to fetch proxies in parallel.
    for target in targets:
        getthreads.append(ProxyGet(target))
    for t in getthreads:
        t.start()
    for t in getthreads:
        t.join()
    print('.' * 10 + " Crawled a total of %s proxies " % len(rawproxylist) + '.' * 10)
    # Start 20 checker threads; split the raw list into 20 roughly equal
    # chunks (ceiling division so no entry is dropped).
    chunk = (len(rawproxylist) + 19) // 20
    for i in range(20):
        checkthreads.append(ProxyCheck(rawproxylist[chunk * i:chunk * (i + 1)]))
    for t in checkthreads:
        t.start()
    for t in checkthreads:
        t.join()
    print('.' * 10 + " A total of %s proxies passed the check " % len(checkedproxylist) + '.' * 10)
# Insert into the database; create the table yourself with four fields:
# ip, port, speed, address.
def db_insert(insert_list):
    """Replace the contents of the `proxy` table with *insert_list*.

    insert_list: iterable of (ip, port, speed, address) tuples.
    Errors are reported to stdout rather than raised, matching the
    script's best-effort style.
    """
    try:
        conn = MySQLdb.connect(host="localhost", user="root", passwd="admin",
                               db="M_common", charset="utf8")
        cursor = conn.cursor()
        cursor.execute('delete from proxy')
        # Reset the auto-increment counter so ids start from 1 again.
        cursor.execute('alter table proxy auto_increment=1')
        cursor.executemany(
            "insert into proxy (ip,port,speed,address) values (%s,%s,%s,%s)",
            insert_list)
        conn.commit()
        cursor.close()
        conn.close()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
# Sort proxies by response time and persist: fastest first, keep only those
# that answered within 8 seconds, write to a text file and then the database.
proxy_ok = []
with open("Proxy_list.txt", 'w+') as f:
    for proxy in sorted(checkedproxylist, key=lambda x: x[3]):
        if proxy[3] < 8:
            # DB column order is (ip, port, speed, address).
            proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))
            f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
db_insert(proxy_ok)
The test results are as follows:
# python -V
Python 2.6.6
# python proxy.py
[' 61.58.94.179 ', ' 8088 ', ' \xe5\x8f\xb0\xe6\xb9\xbe\xe7\x9c\x81 \xe5\x8f\xb0\xe6\xb9\xbe\xe5\xae\xbd\xe9\xa2\x91\ Xe9\x80\x9a\xe8\xae\xaf\xe9\xa1\xbe\xe9\x97\xae\xe8\x82\xa1\xe4\xbb\xbd\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\ Xe5\x8f\xb8 ']
[' 200.84.116.99 ', ' 9064 ', ' \xe5\xa7\x94\xe5\x86\x85\xe7\x91\x9e\xe6\x8b\x89 ']
[' 183.223.204.8 ', ' 8123 ', ' \xe5\x9b\x9b\xe5\xb7\x9d\xe7\x9c\x81\xe8\x87\xaa\xe8\xb4\xa1\xe5\xb8\x82 \xe7\xa7\xbb\ Xe5\x8a\xa8 ']
.......... A total of 1921 proxies were crawled ..........
.......... A total of 524 proxies passed the check ..........
# more Proxy_list.txt
202.106.169.142:80 Beijing Unicom ADSL 0.291432857513
111.13.136.59:80 Beijing Mobile 0.297957897186
111.13.136.56:80 Beijing Mobile 0.373070955276
111.206.81.248:80 Beijing Unicom 0.403017997742
111.13.136.58:80 Beijing Mobile 0.414332151413
124.202.217.134:8118 Beijing Telecom Tong 0.416817903519
124.202.183.218:8118 Beijing Telecom Tong 0.426618099213
120.132.71.232:80 Beijing Unicom 0.440200090408
61.232.6.164:8081 Beijing railcom 0.469615936279
118.144.96.253:80 Beijing Telecom Tong 0.485229969025
203.192.10.66:80 Beijing Xinhua News agency 0.51485991478
124.202.182.22:8118 Beijing Telecom Tong 0.553130865097
Database:
Mysql> select * from M_common.proxy limit 10;
+----------+-----------------+------+----------+----------------------+---------------------+
| proxy_id | IP | Port | Speed | Address | Create_time |
+----------+-----------------+------+----------+----------------------+---------------------+
| 1 | 202.106.169.142 | 80 | 0.291433 | Beijing Unicom ADSL | 2015-02-26 11:29:24 |
| 2 | 111.13.136.59 | 80 | 0.297958 | Beijing Mobile | 2015-02-26 11:29:24 |
| 3 | 111.13.136.56 | 80 | 0.373071 | Beijing Mobile | 2015-02-26 11:29:24 |
| 4 | 111.206.81.248 | 80 | 0.403018 | Beijing Unicom | 2015-02-26 11:29:24 |
| 5 | 111.13.136.58 | 80 | 0.414332 | Beijing Mobile | 2015-02-26 11:29:24 |
| 6 | 124.202.217.134 | 8118 | 0.416818 | Beijing Telecom Communication | 2015-02-26 11:29:24 |
| 7 | 124.202.183.218 | 8118 | 0.426618 | Beijing Telecom Communication | 2015-02-26 11:29:24 |
| 8 | 120.132.71.232 | 80 | 0.4402 | Beijing Unicom | 2015-02-26 11:29:24 |
| 9 | 61.232.6.164 | 8081 | 0.469616 | Beijing railcom | 2015-02-26 11:29:24 |
| 10 | 118.144.96.253 | 80 | 0.48523 | Beijing Telecom Communication | 2015-02-26 11:29:24 |
+----------+-----------------+------+----------+----------------------+---------------------+
Rows in Set (0.00 sec)