This article describes how to capture proxy-server IP addresses with multiple Python threads. Because of the GIL, Python threads cannot run bytecode truly in parallel, but since this task is I/O-bound (network requests), threading still gives a large speedup. The example scrapes the proxy list at http://www.proxy.com.ru; the code is as follows:
#! /Usr/bin/env python # coding: utf-8import urllib2import reimport threadingimport timeimport MySQLdbrawProxyList = [] checkedProxyList = [] # capture proxy website targets = [] for I in xrange (1, 42 ): target = r "http://www.proxy.com.ru/list_%d.html" % I targets. append (target) # capture proxy server regular p = re. compile (r '''(\ D +)(. + ?)(\ D +)(. + ?)(. + ?)''') # Obtain the proxy class ProxyGet (threading. thread): def _ init _ (self, target): threading. thread. _ init _ (self) self.tar get = target def getProxy (self): print "proxy server target website:" + self.tar get req = urllib2.urlopen(self.tar get) result = req. read () # print chardet. detect (result) matchs = p. findall (result) # print matchs for row in matchs: ip = row [1] port = row [2] addr = row [4]. decode ("cp936 "). encode ("UTF-8") proxy = [ip, port, addr] print proxy rawProxyList. append (proxy) def run (self): self. getProxy () # verify the proxy class ProxyCheck (threading. thread): def _ init _ (self, proxyList): threading. thread. _ init _ (self) self. proxyList = proxyList self. timeout = 5 self. testUrl =" http://www.baidu.com /"Self. testStr =" 030173 "def checkProxy (self): cookies = urllib2.HTTPCookieProcessor () for proxy in self. proxyList: proxyHandler = urllib2.ProxyHandler ({" http ": R' http://%s:%s '% (Proxy [0], proxy [1])}) # print R' http://%s:%s '% (Proxy [0], proxy [1]) opener = urllib2.build _ opener (cookies, proxyHandler) opener. addheaders = [('User-agent', 'mozilla/5.0 (Windows NT 6.2; WOW64; rv: 22.0) Gecko/20100101 Firefox/22.0 ')] # urllib2.install _ opener (opener) t1 = time. time () try: # req = urllib2.urlopen (" http://www.baidu.com ", Timeout = self. timeout) req = opener. open (self. testUrl, timeout = self. timeout) # print "urlopen is OK .... "result = req. read () # print "read html .... "timeused = time. time ()-t1 pos = result. find (self. testStr) # print "pos is % s" % pos if pos> 1: checkedProxyList. 
append (proxy [0], proxy [1], proxy [2], timeused) # print "OK ip: % s "% (proxy [0], proxy [1], proxy [2], timeused) else: continue failed t Exception, e: # print e. message continue def run (self): self. checkProxy () if _ name _ = "_ main __": getThreads = [] checkThreads = [] # Start a thread for each target website to capture the proxy for I in range (len (targets): t = ProxyGet (targets [I]) getThreads. append (t) for I in range (len (getThreads): getThreads [I]. start () for I in range (len (getThreads): getThreads [I]. join () print '. '* 10 + "captured % s proxies in total" % len (rawProxyList) + '. '* 10 # enable 20 threads for verification, and divide the captured proxy into 20 parts. each thread verifies one copy for I in range (20 ): t = ProxyCheck (rawProxyList [(len (rawProxyList) + 19)/20) * I :( (len (rawProxyList) + 19)/20) * (I + 1)]) checkThreads. append (t) for I in range (len (checkThreads): checkThreads [I]. start () for I in range (len (checkThreads): checkThreads [I]. join () print '. '* 10 + "a total of % s proxies passed the verification" % len (checkedProxyList) + '. '* 10 # insert a database. the table structure is created by yourself. four fields are ip address, port, speed, and addressdef db_insert (insert_list): try: conn = MySQLdb. connect (host = "localhost", user = "root", passwd = "admin", db = "m_common", charset = 'utf8') cursor = conn. cursor () cursor.exe cute ('delete from proxy') cursor.exe cute ('alter table proxy AUTO_INCREMENT = 1') cursor.exe cute.pdf ("insert into proxy (ip, port, speed, address) VALUES (% s, % s) ", insert_list) conn. commit () cursor. close () conn. close () distinct T MySQLdb. error, e: print "Mysql Error % d: % s" % (e. args [0], e. 
args [1]) # proxy sorting persistence proxy_ OK = [] f = open ("proxy_list.txt", 'W + ') for proxy in sorted (checkedProxyList, cmp = lambda x, y: cmp (x [3], y [3]): if proxy [3] <8: # print "checked proxy is: % s: % s \ t % s "% (proxy [0], proxy [1], proxy [2], proxy [3]) proxy_ OK .append (proxy [0], proxy [1], proxy [3], proxy [2]) f. write ("% s: % s \ t % s \ n" % (proxy [0], proxy [1], proxy [2], proxy [3]) f. close () db_insert (proxy_ OK)
Test:
python proxy.py
The result is as follows:
['61. 58.94.179 ', '123 ', '\ xe5 \ x8f \ xb0 \ xe6 \ xb9 \ xbe \ xe7 \ x9c \ x81 \ xe5 \ x8f \ xb0 \ xe6 \ xb9 \ xbe \ xe5 \ xae \ xbd \ xe9 \ xa2 \ x91 \ xe9 \ x80 \ x9a \ xe8 \ xae \ xaf \ xe9 \ xa1 \ xbe \ xe9 \ x97 \ xae \ xe8 \ x82 \ xa1 \ xe4 \ xbb \ xbd \ xe6 \ x9c \ x89 \ xe9 \ x99 \ x90 \ xe5 \ x85 \ xac \ xe5 \ x8f \ xb8'] ['2017. 84.116.99 ', '000000',' \ xe5 \ xa7 \ x94 \ xe5 \ x86 \ x85 \ xe7 \ x91 \ x9e \ xe6 \ x8b \ x89 '] ['2017. 223.204.8 ', '123 ', '\ xe5 \ x9b \ x9b \ xe5 \ xb7 \ x9d \ xe7 \ x9c \ x81 \ xe8 \ x87 \ xaa \ xe8 \ xb4 \ xa1 \ xe5 \ xb8 \ x82 \ xe7 \ xa7 \ xbb \ xe5 \ x8a \ xa8 '] ...... A total of 1921 proxies are crawled .................... A total of 524 proxies have passed verification .......... # more information: 80 Beijing Unicom ADSL region: 80 Beijing mobile terminal: 80 Beijing mobile terminal: 80 Beijing Unicom 0.403017997742111.13.136.58: 80 Beijing mobile terminal: 8118 Beijing Telecom 0.20.17903519124.202.183.218: 8118 Beijing telecom terminal: 80 Beijing Unicom 0.20.20009040861.232.6.164: 8081 Beijing Tietong region: 80 Beijing communication channel 0.485229969025203.192.10.66: 80 Beijing Xinhua News Agency 0.51485991478124.202.182.22: 8118 Beijing communication channel 0.553130865097
Database:
mysql> select * from m_common.proxy limit 10;
+ ---------- + ----------------- + ------ + ---------- + Hour + | proxy_id | ip | port | speed | address | create_time | + ---------- + --------------- + ------ + ---------- + hour + | 1 | 202.106.169.142 | 80 | 0.291433 | Beijing Unicom ADSL | 11:29:24 | 2 | 111.13.136.59 | 80 | 0.297958 | Beijing Mobile | 11:29:24 | 3 | 111.13.136.56 | 80 | 0.373071 | Beijing mobile | 11:29:24 | 4 | 111.206.81.248 | 80 | 0.403018 | Beijing Unicom | 11:29:24 | 5 | 111.13.136.58 | 80 | 0.414332 | Beijing Mobile | 11:29:24 | 6 | 124.202.217.134 | 8118 | 0.416818 | Beijing Telecom | 11:29:24 | 7 | 124.202.183.218 | 8118 | 0.426618 | Beijing Telecom | 11:29:24 | 8 | 120.132.71.133 | 80 | 0.4402 | Beijing Unicom | 11:29:24 | | 9 | 61.232.6.164 | 8081 | 0.469616 | Beijing Tietong | 11:29:24 | 10 | 118.144.96.253 | 80 | 0.48523 | Beijing Telecom | 11:29:24 | + ------------ + --------------- + ------ + ---------- + large ---------------------- + --------------------- + 10 rows in set (0.00 sec)