#!/usr/bin/env python
# coding: utf-8
import urllib2
import re
import threading
import time
import random

rawProxyList = []
checkedProxyList = []
imgurl_list = []
# Proxy sites to scrape
portdicts = {'v': "3", 'm': "4", 'a': "2", 'l': "9", 'q': "0", 'b': "5", 'i': "7", 'w': "6", 'r': "8", 'c': "1"}
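# cnproxy.com obfuscates each port: the page writes it via JavaScript as a
# '+'-joined string of letter codes (e.g. document.write(":"+"b+q+b+q")).
# Decoding maps every letter through portdicts, so "b+q+b+q" -> "5050".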
targets = []
for i in xrange(1, 9):
    target = r"http://www.cnproxy.com/proxy%d.html" % i
    targets.append(target)
#print targets
# Regex for scraping proxy servers
p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')
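# The four capture groups are: (1) the IP from the first cell, (2) the
# obfuscated port expression inside document.write, (3) the server-type
# column, and (4) the location, which the page serves GBK-encoded (cp936).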
# Class that fetches proxies
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print "Proxy server target site: " + self.target
        req = urllib2.urlopen(self.target)
        result = req.read()
        #print chardet.detect(result)
        matchs = p.findall(result)
        for row in matchs:
            ip = row[0]
            port = row[1]
            port = map(lambda x: portdicts[x], port.split('+'))  # decode the letter-coded port
            port = ''.join(port)
            agent = row[2]
            addr = row[3].decode("cp936").encode("utf-8")
            proxy = [ip, port, addr]
            #print proxy
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()
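# All ProxyGet threads append to the shared rawProxyList; CPython's
# list.append is atomic under the GIL, so no explicit lock is needed here.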
# Class that validates proxies
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"
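    # "030173" is a string expected in the genuine Baidu homepage (it appears
    # in the ICP licence line of the page footer), so a proxy only passes if
    # it returns the real page rather than an error or ad page.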
    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            #print r'http://%s:%s' % (proxy[0], proxy[1])
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            urllib2.install_opener(opener)
            t1 = time.time()
            try:
                #req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout)
                req = opener.open(self.testUrl, timeout=self.timeout)
                #print "urlopen is ok...."
                result = req.read()
                #print "read html...."
                timeused = time.time() - t1
                pos = result.find(self.testStr)
                #print "pos is %s" % pos
                if pos > 1:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
                    #print "ok ip: %s %s %s %s" % (proxy[0], proxy[1], proxy[2], timeused)
                else:
                    continue
            except Exception, e:
                #print e.message
                continue

    def run(self):
        self.checkProxy()
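# Note: urllib2.install_opener replaces the process-wide default opener, which
# is racy with 20 checker threads running at once; it is also redundant here,
# since the request goes through the local opener.open() call anyway.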
# Function that gathers image URLs
def imgurlList(url_home):
    global imgurl_list
    home_page = urllib2.urlopen(url_home)
    url_re = re.compile(r'<li><a href="(.+?)" target="_blank" rel="nofollow">')
    # NOTE: the image pattern was garbled in the source; this is a plausible
    # reconstruction that captures absolute image URLs from <img src="...">.
    pic_re = re.compile(r'<img src="(http.+?)"')
    url_list = re.findall(url_re, home_page.read())
    for url in url_list:
        #print url_home + url
        url_page = urllib2.urlopen(url_home + url)
        for imgurl in re.findall(pic_re, url_page.read()):
            imgurl_list.append(imgurl)
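# imgurlList runs single-threaded before any downloads start: the hrefs it
# finds are site-relative, hence the url_home + url concatenation, and the
# global imgurl_list it fills is later sliced across the download threads.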
# Class that downloads images
class getPic(threading.Thread):
    def __init__(self, imgurl_list):
        threading.Thread.__init__(self)
        self.imgurl_list = imgurl_list
        self.timeout = 5

    def downloadImg(self):
        for imgurl in self.imgurl_list:
            pic_suffix = imgurl.split('.')[-1]  # take the image's file extension
            pic_name = str(random.randint(0, 10000000000)) + '.' + pic_suffix
            cookies = urllib2.HTTPCookieProcessor()
            randomCheckedProxy = random.choice(checkedProxyList)  # pick a random validated proxy
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (randomCheckedProxy[0], randomCheckedProxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            urllib2.install_opener(opener)
            try:
                data_img = opener.open(imgurl, timeout=self.timeout)
                f = open(pic_name, 'wb')
                f.write(data_img.read())
                f.close()
            except:
                continue

    def run(self):
        self.downloadImg()
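# Each getPic thread works through its own slice of URLs, drawing a fresh
# random proxy per image, so one dead proxy costs at most the single download
# that the bare except silently skips.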
if __name__ == "__main__":
    getThreads = []
    checkThreads = []
    imgurlList('http://www.ivsky.com')
    getPicThreads = []

    # Start one thread per target site to scrape proxies
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)

    for i in range(len(getThreads)):
        getThreads[i].start()

    for i in range(len(getThreads)):
        getThreads[i].join()

    print '.' * 10 + "%s proxies fetched in total" % len(rawProxyList) + '.' * 10
    # Start 20 checker threads: split the scraped proxies into 20 chunks, one per thread
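    # (len(list) + 19) / 20 is ceil(len/20) under Python 2 integer division:
    # e.g. 101 proxies give chunks of 6, and trailing threads may receive a
    # short or empty slice (slicing past the end of a list is safe).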
    for i in range(20):
        t = ProxyCheck(rawProxyList[((len(rawProxyList) + 19) / 20) * i:((len(rawProxyList) + 19) / 20) * (i + 1)])
        checkThreads.append(t)

    for i in range(len(checkThreads)):
        checkThreads[i].start()

    for i in range(len(checkThreads)):
        checkThreads[i].join()

    print '.' * 10 + "%s proxies passed the check" % len(checkedProxyList) + '.' * 10
    # Start 20 threads that each download images through a randomly chosen proxy
    for i in range(20):
        t = getPic(imgurl_list[((len(imgurl_list) + 19) / 20) * i:((len(imgurl_list) + 19) / 20) * (i + 1)])
        getPicThreads.append(t)

    for i in range(len(getPicThreads)):
        getPicThreads[i].start()

    for i in range(len(getPicThreads)):
        getPicThreads[i].join()

    print '.' * 10 + "%s images downloaded in total" % len(imgurl_list) + '.' * 10
    # Sort the proxies and persist them to disk
    f = open("proxy_list.txt", 'w+')
    for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])):
        #print "checked proxy is: %s:%s\t%s\t%s" % (proxy[0], proxy[1], proxy[2], proxy[3])
        f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
    f.close()
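    # proxy_list.txt is sorted by response time, fastest first; each line is
    # "ip:port<TAB>location<TAB>seconds", e.g. (illustrative values only):
    # 61.135.xx.xx:80	Beijing	0.53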