I. Function Description:
1. Crawl proxy servers with multiple threads, then verify the crawled proxies with multiple threads. The proxies are crawled from http://www.cnproxy.com/ (the test uses only the first 8 pages), which hides each port behind a small piece of JavaScript; see the decoding sketch after this list.
2. Crawl the image addresses of a website and download the images with multiple threads, sending each request through a randomly chosen verified proxy.
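On cnproxy's pages the port number is not plain text: each table row emits it with document.write(":"+...), where the argument is a '+'-separated string of letters and every letter stands for one digit. The portdicts table in the script below is exactly that letter-to-digit mapping; here is a minimal sketch of the decoding step, using a made-up letter string for illustration:

# cnproxy's letter-to-digit port table (the portdicts mapping in the script)
PORTDICTS = {'v': "3", 'm': "4", 'a': "2", 'l': "9", 'q': "0",
             'b': "5", 'i': "7", 'w': "6", 'r': "8", 'c': "1"}

def decode_port(encoded):
    # e.g. "r+q+r+q" -> ['r', 'q', 'r', 'q'] -> "8080"
    return ''.join(PORTDICTS[ch] for ch in encoded.split('+'))

print decode_port("r+q+r+q")  # hypothetical input; prints 8080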
II. Implementation Code:
The code is as follows:
#!/usr/bin/env python
# coding: utf-8

import urllib2
import re
import threading
import time
import random

rawProxyList = []
checkedProxyList = []
imgurl_list = []

# Proxy websites to crawl; portdicts is cnproxy's letter-to-digit port table
portdicts = {'v': "3", 'm': "4", 'a': "2", 'l': "9", 'q': "0",
             'b': "5", 'i': "7", 'w': "6", 'r': "8", 'c': "1"}
targets = []
for i in xrange(1, 9):
    target = r"http://www.cnproxy.com/proxy%d.html" % i
    targets.append(target)
# print targets

# Regular expression for capturing proxy servers
p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')
# Obtain the proxy class
Class ProxyGet (threading. Thread ):
Def _ init _ (self, target ):
Threading. Thread. _ init _ (self)
Self.tar get = target
Def getProxy (self ):
Print "Proxy Server target Website:" + self.tar get
Req = urllib2.urlopen(self.tar get)
Result = req. read ()
# Print chardet. detect (result)
Matchs = p. findall (result)
For row in matchs:
Ip = row [0]
Port = row [1]
Port = map (lambda x: portdicts [x], port. split ('+ '))
Port = ''. join (port)
Agent = row [2]
Addr = row [3]. decode ("cp936"). encode ("UTF-8 ")
Proxy = [ip, port, addr]
# Print proxy
RawProxyList. append (proxy)
Def run (self ):
Self. getProxy ()
# Proxy-verification class
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"

    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            # print r'http://%s:%s' % (proxy[0], proxy[1])
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            # urllib2.install_opener(opener)
            t1 = time.time()

            try:
                # req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout)
                req = opener.open(self.testUrl, timeout=self.timeout)
                # print "urlopen is ok...."
                result = req.read()
                # print "read html...."
                timeused = time.time() - t1
                pos = result.find(self.testStr)
                # print "pos is %s" % pos

                if pos > 1:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
                    # print "ok ip: %s %s %s %s" % (proxy[0], proxy[1], proxy[2], timeused)
                else:
                    continue
            except Exception, e:
                # print e.message
                continue

    def run(self):
        self.checkProxy()
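# Note on the check above: self.testStr = "030173" is part of the ICP
# licence number shown in Baidu's page footer, so finding it in the body
# means the proxy really returned Baidu's page rather than an error page;
# timeused (seconds for the whole request) is kept so the final step can
# sort proxies by speed.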
# Function to collect image addresses
def imgurlList(url_home):
    global imgurl_list
    home_page = urllib2.urlopen(url_home)
    url_re = re.compile(r'<li><a href="(.+?)" target="_blank" rel="nofollow">')
    # NOTE: assumed pattern -- captures <img src="..."> URLs ending in a
    # 3- or 4-character suffix (e.g. .jpg)
    pic_re = re.compile(r'<img src="(.+?\.\w{3,4})"')
    url_list = re.findall(url_re, home_page.read())
    for url in url_list:
        # print url_home + url
        url_page = urllib2.urlopen(url_home + url)
        for imgurl in re.findall(pic_re, url_page.read()):
            imgurl_list.append(imgurl)
# Image-downloading class
class getPic(threading.Thread):
    def __init__(self, imgurl_list):
        threading.Thread.__init__(self)
        self.imgurl_list = imgurl_list
        self.timeout = 5

    def downloadimg(self):
        for imgurl in self.imgurl_list:
            pic_suffix = imgurl.split('.')[-1]  # get the image suffix
            pic_name = str(random.randint(0, 9999999999)) + '.' + pic_suffix  # random numeric file name
            cookies = urllib2.HTTPCookieProcessor()
            randomCheckedProxy = random.choice(checkedProxyList)  # pick one verified proxy at random
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (randomCheckedProxy[0], randomCheckedProxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            urllib2.install_opener(opener)
            try:
                data_img = opener.open(imgurl, timeout=self.timeout)
                f = open(pic_name, 'wb')
                f.write(data_img.read())
                f.close()
            except:
                continue

    def run(self):
        self.downloadimg()
if __name__ == "__main__":
    getThreads = []
    checkThreads = []
    imgurlList('http://www.ivsky.com')
    getPicThreads = []

    # Start one thread per target website to crawl proxies
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)

    for i in range(len(getThreads)):
        getThreads[i].start()

    for i in range(len(getThreads)):
        getThreads[i].join()

    print '.' * 10 + "A total of %s proxies were captured" % len(rawProxyList) + '.' * 10

    # Start 20 verification threads: split the captured proxies into 20
    # chunks and let each thread verify one chunk
    chunk = (len(rawProxyList) + 19) / 20
    for i in range(20):
        t = ProxyCheck(rawProxyList[chunk * i : chunk * (i + 1)])
        checkThreads.append(t)

    for i in range(len(checkThreads)):
        checkThreads[i].start()

    for i in range(len(checkThreads)):
        checkThreads[i].join()

    print '.' * 10 + "A total of %s proxies passed verification" % len(checkedProxyList) + '.' * 10

    # Start 20 download threads, each fetching its chunk of image URLs
    # through randomly chosen verified proxies
    chunk = (len(imgurl_list) + 19) / 20
    for i in range(20):
        t = getPic(imgurl_list[chunk * i : chunk * (i + 1)])
        getPicThreads.append(t)

    for i in range(len(getPicThreads)):
        getPicThreads[i].start()

    for i in range(len(getPicThreads)):
        getPicThreads[i].join()

    print '.' * 10 + "A total of %s images were downloaded" % len(imgurl_list) + '.' * 10

    # Sort the verified proxies by response time and persist them
    f = open("proxy_list.txt", 'w+')
    for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])):
        # print "checked proxy is: %s:%s\t%s\t%s" % (proxy[0], proxy[1], proxy[2], proxy[3])
        f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
    f.close()
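The chunk size used for both thread pools, (len(lst) + 19) / 20, is integer ceiling division in Python 2: every thread gets at most ceil(len/20) items, the final slice just comes up short, and a slice past the end of the list is simply empty, so nothing is dropped. A minimal sketch with a made-up list length:

# Ceiling-division chunking, as used for rawProxyList and imgurl_list above
items = range(458)               # e.g. 458 verified proxies
chunk = (len(items) + 19) / 20   # Python 2 integer division: ceil(458/20) = 23
chunks = [items[chunk * i : chunk * (i + 1)] for i in range(20)]
print [len(c) for c in chunks]   # [23]*19 + [21] -- all 458 items covered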
III. Test Results:
The output is as follows:
# ls
proxy_getpic.py
# python proxy_getpic.py
Proxy server target website: http://www.cnproxy.com/proxy1.html
Proxy server target website: http://www.cnproxy.com/proxy2.html
Proxy server target website: http://www.cnproxy.com/proxy3.html
Proxy server target website: http://www.cnproxy.com/proxy4.html
Proxy server target website: http://www.cnproxy.com/proxy5.html
Proxy server target website: http://www.cnproxy.com/proxy6.html
Proxy server target website: http://www.cnproxy.com/proxy7.html
Proxy server target website: http://www.cnproxy.com/proxy8.html
..........A total of 800 proxies were captured..........
..........A total of 458 proxies passed verification..........
..........A total of 154 images were downloaded..........
# cat proxy_list.txt | more
173.213.113.111:3128    United States                               0.432188987732
173.213.113.111:8089    United States                               0.441318035126
173.213.113.111:7808    United States                               0.444597005844
110.4.24.170:80         Hong Kong Mobile Communication Co., Ltd.    0.489440202713
211.142.236.135:8080    Hunan Zhuzhou Mobile                        0.490673780441
211.142.236.135:8081    Hunan Zhuzhou Mobile                        0.518096923828
211.142.236.135:8000    Hunan Zhuzhou Mobile                        0.51860499382
211.142.236.135:8082    Hunan Zhuzhou Mobile                        0.520448207855
# ls
1001117689.jpg 3097883176.jpg 5234319709.jpg 7012274766.jpg 850116248.jpg
1076458640.jpg 3144425522.jpg 5387877704.jpg 7109183143.jpg 867723868.jpg
1198548712.jpg 1091307031.jpg 5572092752.jpg 7361254661.jpg 8746315373.jpg
165738192.jpg 3228008315.jpg 5575388077.jpg 7389537793.jpg 8848973192.jpg
1704512138.jpg 330693%4.jpg 56%40708.jpg 7407358698.jpg 8973834958.jpg
1742167711.jpg 3320152673.jpg 5717429022.jpg 7561176207.jpg 8976862152.jpg
...............
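Because each line of proxy_list.txt is written as ip:port, location, and response time separated by tabs, the verified list is easy to reuse in a later run. A minimal sketch for reading it back, assuming the file produced above:

# Parse proxy_list.txt back into (ip_port, location, seconds) tuples
proxies = []
for line in open("proxy_list.txt"):
    ip_port, addr, seconds = line.rstrip('\n').split('\t')
    proxies.append((ip_port, addr, float(seconds)))
print proxies[0]  # the script writes entries sorted fastest-first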