Python: capturing website images through proxies (multithreaded)

Source: Internet
Author: User

I. Function Description:
1. Crawl proxy servers with multiple threads, then verify them with multiple threads.
P.S. The proxy servers are crawled from http://www.cnproxy.com/ (the test uses only the first 8 pages). A short sketch of the site's port-decoding trick follows this list.
2. Crawl a website's image addresses and download the images with multiple threads, each request going through a randomly chosen verified proxy server.
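
A note on step 1: cnproxy does not print the ports as plain text. Each port is written out by a small JavaScript snippet of the form document.write(":"+v+m+...), where every letter stands for one digit, and the script below reverses that with a lookup table. A minimal sketch of just the decoding step, using the letter-to-digit table from the script (the input string "r+q+r+q" is a made-up example):

portdicts = {'v': "3", 'm': "4", 'a': "2", 'l': "9", 'q': "0",
             'b': "5", 'i': "7", 'w': "6", 'r': "8", 'c': "1"}
# decode an obfuscated port string such as "r+q+r+q" back into "8080"
print ''.join(portdicts[c] for c in "r+q+r+q".split('+'))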
II. Implementation Code
The code is as follows (Python 2: it relies on urllib2, xrange, and print statements):
#!/usr/bin/env python
# coding: utf-8

import urllib2
import re
import threading
import time
import random

rawProxyList = []
checkedProxyList = []
imgurl_list = []

# Crawl the proxy listing site
portdicts = {'v': "3", 'm': "4", 'a': "2", 'l': "9", 'q': "0",
             'b': "5", 'i': "7", 'w': "6", 'r': "8", 'c': "1"}
targets = []
for i in xrange(1, 9):
    target = r"http://www.cnproxy.com/proxy%d.html" % i
    targets.append(target)
#print targets

# Regular expression for extracting proxy servers
p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')

# Proxy-fetching thread
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print "Proxy server target website: " + self.target
        req = urllib2.urlopen(self.target)
        result = req.read()
        #print chardet.detect(result)
        matchs = p.findall(result)
        for row in matchs:
            ip = row[0]
            port = row[1]
            # The port is obfuscated as letter codes joined by '+';
            # map each letter back to its digit and join the digits.
            port = map(lambda x: portdicts[x], port.split('+'))
            port = ''.join(port)
            agent = row[2]
            addr = row[3].decode("cp936").encode("utf-8")
            proxy = [ip, port, addr]
            #print proxy
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()

# Proxy-verification thread
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"

    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            #print r'http://%s:%s' % (proxy[0], proxy[1])
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            #urllib2.install_opener(opener)
            t1 = time.time()

            try:
                #req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout)
                req = opener.open(self.testUrl, timeout=self.timeout)
                #print "urlopen is ok...."
                result = req.read()
                #print "read html...."
                timeused = time.time() - t1
                pos = result.find(self.testStr)
                #print "pos is %s" % pos

                # Keep the proxy only if the test string shows up in the page,
                # i.e. the proxy returned real content rather than an error page.
                if pos > 1:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
                    #print "ok ip: %s %s %s %s" % (proxy[0], proxy[1], proxy[2], timeused)
                else:
                    continue
            except Exception, e:
                #print e.message
                continue

    def run(self):
        self.checkProxy()

# Collect image URLs from the target site
def imgurlList(url_home):
    global imgurl_list
    home_page = urllib2.urlopen(url_home)
    url_re = re.compile(r'<li><a href="(.+?)" target="_blank" rel="nofollow">')
    # NOTE: the original image pattern was lost in transcription; this generic
    # <img src="..."> pattern is a stand-in.
    pic_re = re.compile(r'<img src="(http://.+?\.(?:jpg|jpeg|png|gif))"')
    url_list = re.findall(url_re, home_page.read())
    for url in url_list:
        #print url_home + url
        url_page = urllib2.urlopen(url_home + url)
        for imgurl in re.findall(pic_re, url_page.read()):
            imgurl_list.append(imgurl)

# Image-downloading thread
class getPic(threading.Thread):
    def __init__(self, imgurl_list):
        threading.Thread.__init__(self)
        self.imgurl_list = imgurl_list
        self.timeout = 5

    def downloadimg(self):
        for imgurl in self.imgurl_list:
            pic_suffix = imgurl.split('.')[-1]  # get the image suffix
            pic_name = str(random.randint(0, 10000000000)) + '.' + pic_suffix
            cookies = urllib2.HTTPCookieProcessor()
            randomCheckedProxy = random.choice(checkedProxyList)  # pick one verified proxy at random
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (randomCheckedProxy[0], randomCheckedProxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            urllib2.install_opener(opener)
            try:
                data_img = opener.open(imgurl, timeout=self.timeout)
                f = open(pic_name, 'wb')
                f.write(data_img.read())
                f.close()
            except:
                continue

    def run(self):
        self.downloadimg()

if __name__ == "__main__":
    getThreads = []
    checkThreads = []
    imgurlList('http://www.ivsky.com')
    getPicThreads = []

    # Start one thread per target page to capture proxies
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)

    for i in range(len(getThreads)):
        getThreads[i].start()

    for i in range(len(getThreads)):
        getThreads[i].join()

    print '.' * 10 + "%s proxies captured in total" % len(rawProxyList) + '.' * 10

    # Start 20 verification threads; split the captured proxies into 20
    # near-equal chunks, one chunk per thread
    for i in range(20):
        t = ProxyCheck(rawProxyList[((len(rawProxyList) + 19) / 20) * i:((len(rawProxyList) + 19) / 20) * (i + 1)])
        checkThreads.append(t)

    for i in range(len(checkThreads)):
        checkThreads[i].start()

    for i in range(len(checkThreads)):
        checkThreads[i].join()

    print '.' * 10 + "%s proxies passed the verification" % len(checkedProxyList) + '.' * 10

    # Start 20 threads, each downloading its chunk of images through
    # randomly chosen verified proxies
    for i in range(20):
        t = getPic(imgurl_list[((len(imgurl_list) + 19) / 20) * i:((len(imgurl_list) + 19) / 20) * (i + 1)])
        getPicThreads.append(t)

    for i in range(len(getPicThreads)):
        getPicThreads[i].start()

    for i in range(len(getPicThreads)):
        getPicThreads[i].join()

    print '.' * 10 + "%s images downloaded in total" % len(imgurl_list) + '.' * 10

    # Persist the verified proxies, sorted by response time (cmp= is Python 2 only)
    f = open("proxy_list.txt", 'w+')
    for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])):
        #print "checked proxy is: %s:%s\t%s\t%s" % (proxy[0], proxy[1], proxy[2], proxy[3])
        f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
    f.close()
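
A note on the slicing in the main block: (len(lst) + 19) / 20 is Python 2 integer division, which works out to ceil(len(lst) / 20), so each list is cut into 20 near-equal chunks and any leftover slots at the end come out empty. A minimal standalone illustration (the 45-item list is a made-up example):

items = range(45)
size = (len(items) + 19) / 20                       # ceil(45 / 20) == 3
chunks = [items[size * i:size * (i + 1)] for i in range(20)]
print len(chunks), len(chunks[0]), len(chunks[-1])  # -> 20 3 0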

III. Test Results:
The output is as follows:
# ls
proxy_getpic.py
# python proxy_getpic.py
Proxy server target website: http://www.cnproxy.com/proxy1.html
Proxy server target website: http://www.cnproxy.com/proxy2.html
Proxy server target website: http://www.cnproxy.com/proxy3.html
Proxy server target website: http://www.cnproxy.com/proxy4.html
Proxy server target website: http://www.cnproxy.com/proxy5.html
Proxy server target website: http://www.cnproxy.com/proxy6.html
Proxy server target website: http://www.cnproxy.com/proxy7.html
Proxy server target website: http://www.cnproxy.com/proxy8.html
..........800 proxies captured in total..........
..........458 proxies passed the verification..........
..........154 images downloaded in total..........
# cat proxy_list.txt | more
173.213.113.111:3128    United States    0.432188987732
173.213.113.111:8089    United States    0.441318035126
173.213.113.111:7808    United States    0.444597005844
110.4.24.170:80         Hong Kong Mobile Communications Co., Ltd.    0.489440202713
211.142.236.135:8080    Hunan Zhuzhou Mobile    0.490673780441
211.142.236.135:8081    Hunan Zhuzhou Mobile    0.518096923828
211.142.236.135:8000    Hunan Zhuzhou Mobile    0.51860499382
211.142.236.135:8082    Hunan Zhuzhou Mobile    0.520448207855
# ls
1001117689.jpg  3097883176.jpg  5234319709.jpg  7012274766.jpg  850116248.jpg
1076458640.jpg  3144425522.jpg  5387877704.jpg  7109183143.jpg  867723868.jpg
1198548712.jpg  1091307031.jpg  5572092752.jpg  7361254661.jpg  8746315373.jpg
165738192.jpg   3228008315.jpg  5575388077.jpg  7389537793.jpg  8848973192.jpg
1704512138.jpg  330693%4.jpg    56%40708.jpg    7407358698.jpg  8973834958.jpg
1742167711.jpg  3320152673.jpg  5717429022.jpg  7561176207.jpg  8976862152.jpg
...............
