Example: capturing proxy server IP addresses with multiple Python threads

Source: Internet
Author: User
This article describes how to capture the IP addresses of proxy servers using multiple Python threads. Note that, because of the GIL, Python cannot achieve truly parallel multi-threaded execution. Readers who need this can use it as a reference. The example captures the proxy servers listed on the http://www.proxy.com.ru site; the code is as follows:

#!/usr/bin/env python
# coding: utf-8
"""Scrape proxy servers from www.proxy.com.ru, verify them, and persist them.

The script runs in three stages:

1. One ``ProxyGet`` thread per list page downloads the page and appends
   every ``[ip, port, address]`` it finds to ``rawProxyList``.
2. Twenty ``ProxyCheck`` threads each take one slice of ``rawProxyList``,
   fetch a test URL through every proxy, and append the working ones
   (with the elapsed time) to ``checkedProxyList``.
3. The verified proxies are sorted by elapsed time, the ones faster than
   8 seconds are written to ``proxy_list.txt`` and inserted into the
   ``proxy`` table of a MySQL database.

This is Python 2 code (it relies on ``urllib2``); the syntax is kept
compatible with Python 2.6+.
"""
import re
import threading
import time
import urllib2

import MySQLdb

# Proxies scraped from the list pages, as [ip, port, address] lists.
rawProxyList = []
# Proxies that passed verification, as (ip, port, address, seconds) tuples.
checkedProxyList = []

# Proxy list pages to scrape: list_1.html ... list_41.html.
targets = [r"http://www.proxy.com.ru/list_%d.html" % i for i in range(1, 42)]

# Pattern used to pull proxy rows out of a list page. Groups 1, 2 and 4
# (0-based) are used as ip, port and address.
# NOTE(review): the published pattern contains only the five bare groups;
# the HTML markup that presumably separated them (table cell tags) looks
# like it was stripped when the article was extracted. Restore the
# delimiters against the real page markup before relying on this.
p = re.compile(r'''(\d+)(.+?)(\d+)(.+?)(.+?)''')


class ProxyGet(threading.Thread):
    """Thread that scrapes one proxy list page into ``rawProxyList``."""

    def __init__(self, target):
        """Remember the URL of the list page this thread will scrape."""
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        """Download ``self.target`` and record every proxy row found in it."""
        print("proxy server target website:" + self.target)
        req = urllib2.urlopen(self.target)
        try:
            result = req.read()
        finally:
            # Release the connection even if the read fails.
            req.close()
        for row in p.findall(result):
            ip = row[1]
            port = row[2]
            # The address is decoded as cp936 and re-encoded as UTF-8.
            addr = row[4].decode("cp936").encode("UTF-8")
            proxy = [ip, port, addr]
            print(proxy)
            rawProxyList.append(proxy)

    def run(self):
        """Thread entry point."""
        self.getProxy()


class ProxyCheck(threading.Thread):
    """Thread that verifies a slice of proxies into ``checkedProxyList``."""

    def __init__(self, proxyList):
        """Store the proxies to verify and the verification settings."""
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        # Marker string that must appear in the test page body.
        self.testStr = "030173"

    def checkProxy(self):
        """Fetch the test URL through each proxy and keep the ones that work.

        A proxy passes when the request succeeds within ``self.timeout``
        seconds and ``self.testStr`` is found in the response body at an
        index greater than 1. Any failure simply skips that proxy.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler(
                {"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [
                ('User-agent',
                 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) '
                 'Gecko/20100101 Firefox/22.0'),
            ]
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                try:
                    result = req.read()
                finally:
                    req.close()
                timeused = time.time() - t1
            except Exception:
                # Best effort: a dead or misbehaving proxy is just skipped.
                continue
            if result.find(self.testStr) > 1:
                # list.append takes exactly one argument, so store a tuple.
                checkedProxyList.append(
                    (proxy[0], proxy[1], proxy[2], timeused))

    def run(self):
        """Thread entry point."""
        self.checkProxy()


def db_insert(insert_list):
    """Replace the contents of the ``proxy`` table with ``insert_list``.

    The table structure must be created beforehand; the four columns
    written are ip, port, speed and address, so every item of
    ``insert_list`` must be an ``(ip, port, speed, address)`` sequence.
    MySQL errors are printed rather than raised.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host="localhost", user="root", passwd="admin",
                               db="m_common", charset='utf8')
        cursor = conn.cursor()
        try:
            cursor.execute('delete from proxy')
            cursor.execute('alter table proxy AUTO_INCREMENT = 1')
            # One placeholder per column listed in the statement.
            cursor.executemany(
                "insert into proxy (ip, port, speed, address) "
                "VALUES (%s, %s, %s, %s)",
                insert_list)
            conn.commit()
        finally:
            cursor.close()
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Close the connection on both the success and the error path.
        if conn is not None:
            conn.close()


def main():
    """Scrape, verify, then persist the proxies to a file and to MySQL."""
    # Start one thread per target page to scrape the proxies.
    getThreads = [ProxyGet(target) for target in targets]
    for thread in getThreads:
        thread.start()
    for thread in getThreads:
        thread.join()

    print('.' * 10 + "captured %s proxies in total" % len(rawProxyList)
          + '.' * 10)

    # Use 20 verification threads; split the scraped proxies into 20
    # slices (slice size rounded up) and give each thread one slice.
    chunk = (len(rawProxyList) + 19) // 20
    checkThreads = [
        ProxyCheck(rawProxyList[chunk * i:chunk * (i + 1)])
        for i in range(20)
    ]
    for thread in checkThreads:
        thread.start()
    for thread in checkThreads:
        thread.join()

    print('.' * 10 + "a total of %s proxies passed the verification"
          % len(checkedProxyList) + '.' * 10)

    # Sort the verified proxies by elapsed time and persist the fast ones.
    proxy_ok = []
    with open("proxy_list.txt", 'w+') as f:
        for proxy in sorted(checkedProxyList, key=lambda item: item[3]):
            if proxy[3] < 8:
                # Database column order is ip, port, speed, address.
                proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))
                f.write("%s:%s\t%s\t%s\n"
                        % (proxy[0], proxy[1], proxy[2], proxy[3]))
    db_insert(proxy_ok)


if __name__ == "__main__":
    main()

Test:

python proxy.py

The result is as follows:

['61. 58.94.179 ', '123 ', '\ xe5 \ x8f \ xb0 \ xe6 \ xb9 \ xbe \ xe7 \ x9c \ x81 \ xe5 \ x8f \ xb0 \ xe6 \ xb9 \ xbe \ xe5 \ xae \ xbd \ xe9 \ xa2 \ x91 \ xe9 \ x80 \ x9a \ xe8 \ xae \ xaf \ xe9 \ xa1 \ xbe \ xe9 \ x97 \ xae \ xe8 \ x82 \ xa1 \ xe4 \ xbb \ xbd \ xe6 \ x9c \ x89 \ xe9 \ x99 \ x90 \ xe5 \ x85 \ xac \ xe5 \ x8f \ xb8'] ['2017. 84.116.99 ', '000000',' \ xe5 \ xa7 \ x94 \ xe5 \ x86 \ x85 \ xe7 \ x91 \ x9e \ xe6 \ x8b \ x89 '] ['2017. 223.204.8 ', '123 ', '\ xe5 \ x9b \ x9b \ xe5 \ xb7 \ x9d \ xe7 \ x9c \ x81 \ xe8 \ x87 \ xaa \ xe8 \ xb4 \ xa1 \ xe5 \ xb8 \ x82 \ xe7 \ xa7 \ xbb \ xe5 \ x8a \ xa8 '] ...... A total of 1921 proxies are crawled .................... A total of 524 proxies have passed verification .......... # more information: 80 Beijing Unicom ADSL region: 80 Beijing mobile terminal: 80 Beijing mobile terminal: 80 Beijing Unicom 0.403017997742111.13.136.58: 80 Beijing mobile terminal: 8118 Beijing Telecom 0.20.17903519124.202.183.218: 8118 Beijing telecom terminal: 80 Beijing Unicom 0.20.20009040861.232.6.164: 8081 Beijing Tietong region: 80 Beijing communication channel 0.485229969025203.192.10.66: 80 Beijing Xinhua News Agency 0.51485991478124.202.182.22: 8118 Beijing communication channel 0.553130865097

Database:

mysql> select * from m_common.proxy limit 10;

+----------+-----------------+------+----------+---------------------+-------------+
| proxy_id | ip              | port | speed    | address             | create_time |
+----------+-----------------+------+----------+---------------------+-------------+
|        1 | 202.106.169.142 | 80   | 0.291433 | Beijing Unicom ADSL | 11:29:24    |
|        2 | 111.13.136.59   | 80   | 0.297958 | Beijing Mobile      | 11:29:24    |
|        3 | 111.13.136.56   | 80   | 0.373071 | Beijing Mobile      | 11:29:24    |
|        4 | 111.206.81.248  | 80   | 0.403018 | Beijing Unicom      | 11:29:24    |
|        5 | 111.13.136.58   | 80   | 0.414332 | Beijing Mobile      | 11:29:24    |
|        6 | 124.202.217.134 | 8118 | 0.416818 | Beijing Telecom     | 11:29:24    |
|        7 | 124.202.183.218 | 8118 | 0.426618 | Beijing Telecom     | 11:29:24    |
|        8 | 120.132.71.133  | 80   | 0.4402   | Beijing Unicom      | 11:29:24    |
|        9 | 61.232.6.164    | 8081 | 0.469616 | Beijing Tietong     | 11:29:24    |
|       10 | 118.144.96.253  | 80   | 0.48523  | Beijing Telecom     | 11:29:24    |
+----------+-----------------+------+----------+---------------------+-------------+
10 rows in set (0.00 sec)

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.