The following example script crawls the proxy-server lists published on the http://www.proxy.com.ru site, checks which of the proxies actually work, and saves the working ones. The code is as follows:
#!/usr/bin/env python
# coding: utf-8
"""Crawl the proxy lists on www.proxy.com.ru, verify the proxies, and persist them.

Workflow (see ``main``):

1. One ``ProxyGet`` thread per list page scrapes ``[ip, port, address]``
   entries into ``rawProxyList``.
2. The scraped entries are split into at most 20 chunks; one ``ProxyCheck``
   thread per chunk fetches a test URL through every proxy and records the
   working ones, with their response time, in ``checkedProxyList``.
3. Working proxies are sorted by response time, written to
   ``proxy_list.txt`` and inserted into the MySQL table ``proxy``.

The code avoids Python-2-only syntax so that it runs on Python 2.7 and on
Python 3.
"""
import contextlib
import io
import operator
import re
import threading
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the same names used in this script.
    import urllib.request as urllib2

# [ip, port, address] entries scraped from the list pages.
rawProxyList = []
# (ip, port, address, seconds) tuples for proxies that passed the check.
checkedProxyList = []

# Proxy list pages to crawl: list_1.html .. list_41.html.
targets = [r"http://www.proxy.com.ru/list_%d.html" % i for i in range(1, 42)]

# Regex extracting one proxy table row; groups 1, 2 and 4 (0-based) are used
# as ip, port and address.
# NOTE(review): the published listing lost the HTML tags of this pattern (only
# the five capture groups survived). The <tr>/<td> markup below is a
# reconstruction - verify it against the live page before relying on it.
p = re.compile(
    r"<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td>"
    r"<td>(.+?)</td><td>(.+?)</td></b></tr>"
)

# The address column is decoded as cp936 (GBK).
PAGE_ENCODING = "cp936"
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) "
              "Gecko/20100101 Firefox/22.0")
# Number of parts the scraped proxies are divided into for checking.
CHECK_THREADS = 20
# Proxies slower than this many seconds are not persisted.
MAX_SECONDS = 8


class ProxyGet(threading.Thread):
    """Thread that scrapes one proxy list page into ``rawProxyList``."""

    def __init__(self, target, timeout=10):
        """Remember the page URL to scrape.

        target  -- URL of the list page.
        timeout -- seconds to wait for the page, so that one stalled
                   download cannot block ``join()`` forever.
        """
        threading.Thread.__init__(self)
        self.target = target
        self.timeout = timeout

    def getProxy(self):
        """Download the page and append every proxy found to ``rawProxyList``."""
        print("Proxy server target site: " + self.target)
        try:
            with contextlib.closing(
                    urllib2.urlopen(self.target, timeout=self.timeout)) as response:
                # Decode once at the I/O boundary; "replace" keeps a single
                # bad byte from discarding the whole page.
                page = response.read().decode(PAGE_ENCODING, "replace")
        except Exception as exc:
            # Thread boundary: report the failed page and let the other
            # pages continue.
            print("Failed to fetch %s: %s" % (self.target, exc))
            return

        for row in p.findall(page):
            proxy = [row[1], row[2], row[4]]  # ip, port, address
            print(proxy)
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()


class ProxyCheck(threading.Thread):
    """Thread that validates a list of proxies against a test URL."""

    def __init__(self, proxyList):
        """proxyList -- sequence of [ip, port, address] entries to check."""
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        # Marker that must occur in the response for the proxy to count as
        # working. NOTE(review): presumably a string that only the genuine
        # test page contains - confirm it is still present there.
        self.testStr = "030173"

    def checkProxy(self):
        """Fetch ``testUrl`` through each proxy; record the ones that work.

        A proxy passes when the request finishes within ``timeout`` seconds
        and the body contains ``testStr``. Passing proxies are appended to
        ``checkedProxyList`` as (ip, port, address, seconds).
        """
        cookies = urllib2.HTTPCookieProcessor()
        # The body is read as bytes, so compare against a bytes marker.
        marker = self.testStr
        if not isinstance(marker, bytes):
            marker = marker.encode("utf-8")

        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler(
                {"http": r"http://%s:%s" % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [("User-agent", USER_AGENT)]
            t1 = time.time()
            try:
                with contextlib.closing(
                        opener.open(self.testUrl, timeout=self.timeout)) as response:
                    body = response.read()
            except Exception:
                # Deliberately broad and silent: any failure (refused,
                # timed out, malformed reply) just means the proxy is
                # unusable.
                continue
            timeused = time.time() - t1

            # A membership test also accepts a marker at offset 0 or 1,
            # which the former "find(...) > 1" check rejected.
            if marker in body:
                checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))

    def run(self):
        self.checkProxy()


def split_chunks(items, parts):
    """Split ``items`` into at most ``parts`` consecutive chunks.

    Every chunk except possibly the last has ceil(len(items) / parts)
    elements; empty chunks are not returned, so an empty input gives [].
    Raises ValueError when ``parts`` is smaller than 1.
    """
    if parts < 1:
        raise ValueError("parts must be at least 1")
    size = (len(items) + parts - 1) // parts  # ceiling division
    if size == 0:
        return []
    return [items[start:start + size] for start in range(0, len(items), size)]


def db_insert(insert_list):
    """Replace the contents of the MySQL table ``proxy`` with ``insert_list``.

    insert_list -- sequence of (ip, port, speed, address) tuples.

    The table has to be created beforehand; this function fills the four
    columns ip, port, speed and address. MySQL errors are reported on stdout
    rather than raised.
    """
    # Third-party driver, imported here so that crawling and checking (and
    # importing this module) do not require it.
    import MySQLdb

    try:
        # NOTE(review): host and credentials are hard-coded; adjust them for
        # the target environment.
        conn = MySQLdb.connect(host="localhost", user="root", passwd="admin",
                               db="M_common", charset="utf8")
        try:
            cursor = conn.cursor()
            cursor.execute("delete from proxy")
            cursor.execute("alter table proxy AUTO_INCREMENT=1")
            cursor.executemany(
                "INSERT INTO proxy (ip,port,speed,address) VALUES (%s,%s,%s,%s)",
                insert_list)
            conn.commit()
            cursor.close()
        finally:
            # Close the connection on the error path as well.
            conn.close()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))


def _run_all(threads):
    """Start every thread, then wait for all of them to finish."""
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()


def main():
    """Crawl, check, then persist the working proxies to file and database."""
    # One thread per list page scrapes the proxies.
    _run_all([ProxyGet(target) for target in targets])
    print("." * 10 + "Total fetch of %s agents" % len(rawProxyList) + "." * 10)

    # Divide the scraped proxies into 20 parts; each thread checks one part.
    _run_all([ProxyCheck(chunk)
              for chunk in split_chunks(rawProxyList, CHECK_THREADS)])
    print("." * 10 + "A total of %s agents by checksum" % len(checkedProxyList)
          + "." * 10)

    # Sort by response time and persist the sufficiently fast proxies.
    proxy_ok = []
    with io.open("proxy_list.txt", "w", encoding="utf-8") as f:
        for proxy in sorted(checkedProxyList, key=operator.itemgetter(3)):
            if proxy[3] < MAX_SECONDS:
                # Database column order is (ip, port, speed, address).
                proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))
                f.write(u"%s:%s\t%s\t%s\n"
                        % (proxy[0], proxy[1], proxy[2], proxy[3]))

    db_insert(proxy_ok)


if __name__ == "__main__":
    main()
Test:
python proxy.py
The results are as follows:
['61.58.94.179', '8088', '\xe5\x8f\xb0\xe6\xb9\xbe\xe7\x9c\x81 \xe5\x8f\xb0\xe6\xb9\xbe\xe5\xae\xbd\xe9\xa2\x91\xe9\x80\x9a\xe8\xae\xaf\xe9\xa1\xbe\xe9\x97\xae\xe8\x82\xa1\xe4\xbb\xbd\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\xe5\x8f\xb8']
['200.84.116.99', '9064', '\xe5\xa7\x94\xe5\x86\x85\xe7\x91\x9e\xe6\x8b\x89']
['183.223.204.8', '8123', '\xe5\x9b\x9b\xe5\xb7\x9d\xe7\x9c\x81\xe8\x87\xaa\xe8\xb4\xa1\xe5\xb8\x82 \xe7\xa7\xbb\xe5\x8a\xa8']
..........A total of 1921 agents have been crawled..........
..........A total of 524 agents by verifying..........

# more proxy_list.txt
202.106.169.142:80    Beijing Unicom ADSL           0.291432857513
111.13.136.59:80      Beijing Mobile                0.297957897186
111.13.136.56:80      Beijing Mobile                0.373070955276
111.206.81.248:80     Beijing Unicom                0.403017997742
111.13.136.58:80      Beijing Mobile                0.414332151413
124.202.217.134:8118  Beijing Telecom Tong          0.416817903519
124.202.183.218:8118  Beijing Telecom Tong          0.426618099213
120.132.71.232:80     Beijing Unicom                0.440200090408
61.232.6.164:8081     Beijing CRC                   0.469615936279
118.144.96.253:80     Beijing Telecom Tong          0.485229969025
203.192.10.66:80      Beijing Xinhua News Agency    0.51485991478
124.202.182.22:8118   Beijing City Telecom Pass     0.553130865097
Database:
mysql> select * from M_common.proxy limit 10;
+----------+-----------------+------+----------+----------------------+---------------------+
| proxy_id | ip              | port | speed    | address              | create_time         |
+----------+-----------------+------+----------+----------------------+---------------------+
| 1        | 202.106.169.142 | 80   | 0.291433 | Beijing Unicom ADSL  | 2015-02-26 11:29:24 |
| 2        | 111.13.136.59   | 80   | 0.297958 | Beijing Mobile       | 2015-02-26 11:29:24 |
| 3        | 111.13.136.56   | 80   | 0.373071 | Beijing Mobile       | 2015-02-26 11:29:24 |
| 4        | 111.206.81.248  | 80   | 0.403018 | Beijing Unicom       | 2015-02-26 11:29:24 |
| 5        | 111.13.136.58   | 80   | 0.414332 | Beijing Mobile       | 2015-02-26 11:29:24 |
| 6        | 124.202.217.134 | 8118 | 0.416818 | Beijing Telecom Pass | 2015-02-26 11:29:24 |
| 7        | 124.202.183.218 | 8118 | 0.426618 | Beijing Telecom Pass | 2015-02-26 11:29:24 |
| 8        | 120.132.71.232  | 80   | 0.4402   | Beijing Unicom       | 2015-02-26 11:29:24 |
| 9        | 61.232.6.164    | 8081 | 0.469616 | Beijing CTT          | 2015-02-26 11:29:24 |
| 10       | 118.144.96.253  | 80   | 0.48523  | Beijing Telecom Pass | 2015-02-26 11:29:24 |
+----------+-----------------+------+----------+----------------------+---------------------+
10 rows in set (0.00 sec)