Example of crawling proxy server IP addresses with Python multithreading

Source: Internet
Author: User
Here is an example that crawls proxy servers from the http://www.proxy.com.ru site, validates them, and stores the working ones. The code is as follows:

#!/usr/bin/env python
# coding: utf-8
import urllib2
import re
import threading
import time
import MySQLdb

rawProxyList = []
checkedProxyList = []

# Build the list of proxy listing pages to crawl
targets = []
for i in xrange(1, 42):
    target = r"http://www.proxy.com.ru/list_%d.html" % i
    targets.append(target)

# Regex for the proxy table rows: one capture group per cell,
# so that row[1] = ip, row[2] = port, row[4] = address.
# NOTE: the markup in this pattern is a reconstruction; verify it against the live list pages.
p = re.compile(r'''<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>''')


# Thread class that fetches proxies from one target page
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print "Proxy server target site: " + self.target
        req = urllib2.urlopen(self.target)
        result = req.read()
        matchs = p.findall(result)
        for row in matchs:
            ip = row[1]
            port = row[2]
            # the page is encoded in cp936 (GBK); re-encode the address as UTF-8
            addr = row[4].decode("cp936").encode("utf-8")
            proxy = [ip, port, addr]
            print proxy
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()


# Thread class that validates one slice of the fetched proxies
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"  # string expected in the test page, used to confirm a real response

    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent',
                                  'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                result = req.read()
                timeUsed = time.time() - t1
                pos = result.find(self.testStr)
                if pos > 1:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeUsed))
                else:
                    continue
            except Exception, e:
                continue

    def run(self):
        self.checkProxy()


# Insert into the database; the table is created separately with four data fields:
# ip, port, speed, address
def db_insert(insert_list):
    try:
        conn = MySQLdb.connect(host="localhost", user="root", passwd="admin",
                               db="M_common", charset='utf8')
        cursor = conn.cursor()
        cursor.execute('delete from proxy')
        cursor.execute('alter table proxy auto_increment=1')
        cursor.executemany("insert into proxy(ip,port,speed,address) values(%s,%s,%s,%s)",
                           insert_list)
        conn.commit()
        cursor.close()
        conn.close()
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])


if __name__ == "__main__":
    getThreads = []
    checkThreads = []

    # One thread per target page, each responsible for fetching proxies
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)
    for i in range(len(getThreads)):
        getThreads[i].start()
    for i in range(len(getThreads)):
        getThreads[i].join()

    print '.' * 10 + "Total of %s proxies fetched" % len(rawProxyList) + '.' * 10

    # Start 20 checker threads: split the fetched proxies into 20 slices, one per thread
    chunk = (len(rawProxyList) + 19) / 20
    for i in range(20):
        t = ProxyCheck(rawProxyList[chunk * i:chunk * (i + 1)])
        checkThreads.append(t)
    for i in range(len(checkThreads)):
        checkThreads[i].start()
    for i in range(len(checkThreads)):
        checkThreads[i].join()

    print '.' * 10 + "Total of %s proxies passed the check" % len(checkedProxyList) + '.' * 10

    # Sort the verified proxies by response time and persist them
    proxy_ok = []
    f = open("proxy_list.txt", 'w+')
    for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])):
        if proxy[3] < 8:
            proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))
            f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
    f.close()

    db_insert(proxy_ok)
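The script above is Python 2 only (urllib2, MySQLdb, print statements). As a rough sketch of how the validation step could look on Python 3 with just the standard library, assuming the same test URL and marker string (the function name check_proxy is illustrative, not part of the original script):

# Rough Python 3 sketch of the proxy-validation step; standard library only.
import time
import urllib.request

def check_proxy(ip, port, test_url="http://www.baidu.com/", test_str="030173", timeout=5):
    """Return the response time in seconds if the proxy serves test_url and the
    page contains test_str; return None otherwise."""
    handler = urllib.request.ProxyHandler({"http": "http://%s:%s" % (ip, port)})
    opener = urllib.request.build_opener(handler)
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    start = time.time()
    try:
        with opener.open(test_url, timeout=timeout) as resp:
            body = resp.read().decode("utf-8", errors="ignore")
    except Exception:
        return None
    return time.time() - start if test_str in body else None

if __name__ == "__main__":
    print(check_proxy("202.106.169.142", "80"))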

Test:

python proxy.py

The results are as follows:

['61.58.94.179', '8088', '\xe5\x8f\xb0\xe6\xb9\xbe\xe7\x9c\x81 \xe5\x8f\xb0\xe6\xb9\xbe\xe5\xae\xbd\xe9\xa2\x91\xe9\x80\x9a\xe8\xae\xaf\xe9\xa1\xbe\xe9\x97\xae\xe8\x82\xa1\xe4\xbb\xbd\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\xe5\x8f\xb8']
['200.84.116.99', '9064', '\xe5\xa7\x94\xe5\x86\x85\xe7\x91\x9e\xe6\x8b\x89']
['183.223.204.8', '8123', '\xe5\x9b\x9b\xe5\xb7\x9d\xe7\x9c\x81\xe8\x87\xaa\xe8\xb4\xa1\xe5\xb8\x82 \xe7\xa7\xbb\xe5\x8a\xa8']
..........Total of 1921 proxies fetched..........
..........Total of 524 proxies passed the check..........

# more proxy_list.txt
202.106.169.142:80      Beijing Unicom ADSL          0.291432857513
111.13.136.59:80        Beijing Mobile               0.297957897186
111.13.136.56:80        Beijing Mobile               0.373070955276
111.206.81.248:80       Beijing Unicom               0.403017997742
111.13.136.58:80        Beijing Mobile               0.414332151413
124.202.217.134:8118    Beijing Telecom Tong         0.416817903519
124.202.183.218:8118    Beijing Telecom Tong         0.426618099213
120.132.71.232:80       Beijing Unicom               0.440200090408
61.232.6.164:8081       Beijing CRC                  0.469615936279
118.144.96.253:80       Beijing Telecom Tong         0.485229969025
203.192.10.66:80        Beijing Xinhua News Agency   0.51485991478
124.202.182.22:8118     Beijing Telecom Pass         0.553130865097
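For reference, a minimal sketch of how proxy_list.txt could be consumed afterwards, assuming the tab-separated ip:port / address / speed layout written by the script above (the helper name load_proxies is illustrative):

# Minimal sketch: parse proxy_list.txt (ip:port<TAB>address<TAB>speed) and
# print the fastest entries. Standard library only; runs on Python 2 and 3.
def load_proxies(path="proxy_list.txt"):
    entries = []
    with open(path) as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            if len(parts) != 3:
                continue  # skip malformed lines
            hostport, address, speed = parts
            entries.append((float(speed), hostport, address))
    entries.sort()  # smallest response time first
    return entries

if __name__ == "__main__":
    for speed, hostport, address in load_proxies()[:5]:
        print("%s\t%s\t%.3fs" % (hostport, address, speed))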

Database:

mysql> select * from M_common.proxy limit 10;

+----------+-----------------+------+----------+----------------------+---------------------+
| proxy_id | ip              | port | speed    | address              | create_time         |
+----------+-----------------+------+----------+----------------------+---------------------+
|        1 | 202.106.169.142 | 80   | 0.291433 | Beijing Unicom ADSL  | 2015-02-26 11:29:24 |
|        2 | 111.13.136.59   | 80   | 0.297958 | Beijing Mobile       | 2015-02-26 11:29:24 |
|        3 | 111.13.136.56   | 80   | 0.373071 | Beijing Mobile       | 2015-02-26 11:29:24 |
|        4 | 111.206.81.248  | 80   | 0.403018 | Beijing Unicom       | 2015-02-26 11:29:24 |
|        5 | 111.13.136.58   | 80   | 0.414332 | Beijing Mobile       | 2015-02-26 11:29:24 |
|        6 | 124.202.217.134 | 8118 | 0.416818 | Beijing Telecom Pass | 2015-02-26 11:29:24 |
|        7 | 124.202.183.218 | 8118 | 0.426618 | Beijing Telecom Pass | 2015-02-26 11:29:24 |
|        8 | 120.132.71.232  | 80   | 0.4402   | Beijing Unicom       | 2015-02-26 11:29:24 |
|        9 | 61.232.6.164    | 8081 | 0.469616 | Beijing CTT          | 2015-02-26 11:29:24 |
|       10 | 118.144.96.253  | 80   | 0.48523  | Beijing Telecom Pass | 2015-02-26 11:29:24 |
+----------+-----------------+------+----------+----------------------+---------------------+
10 rows in set (0.00 sec)
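The article leaves the proxy table's DDL to the reader. A plausible schema, inferred from the insert statement and the query output above (the column types and sizes are assumptions, not taken from the article), could be created like this:

# Sketch of a schema matching db_insert() and the query output above.
import MySQLdb

ddl = """
CREATE TABLE IF NOT EXISTS proxy (
    proxy_id    INT UNSIGNED NOT NULL AUTO_INCREMENT,
    ip          VARCHAR(15)  NOT NULL,
    port        VARCHAR(5)   NOT NULL,
    speed       FLOAT        NOT NULL,
    address     VARCHAR(128) NOT NULL,
    create_time TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (proxy_id)
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect(host="localhost", user="root", passwd="admin",
                       db="M_common", charset="utf8")
cursor = conn.cursor()
cursor.execute(ddl)
conn.commit()
cursor.close()
conn.close()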