#!/usr/bin/env python
# coding: utf-8
import urllib2
import re
import threading
import time
import random

rawProxyList = []
checkedProxyList = []
imgurl_list = []
# Proxy sites to scrape
portdicts = {'v': "3", 'm': "4", 'a': "2", 'l': "9", 'q': "0", 'b': "5", 'i': "7", 'w': "6", 'r': "8", 'c': "1"}
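# cnproxy.com obfuscates each port: the page writes it via JavaScript as a
# '+'-joined string of letter codes (e.g. document.write(":"+"b+q+b+q")).
# Decoding maps every letter through portdicts, so "b+q+b+q" -> "5050".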
targets = []
for i in xrange(1, 9):
    target = r"http://www.cnproxy.com/proxy%d.html" % i
    targets.append(target)
#print targets
# Regex for scraping proxy servers
p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')
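# The four capture groups are: (1) the IP from the first cell, (2) the
# obfuscated port expression inside document.write, (3) the server-type
# column, and (4) the location, which the page serves GBK-encoded (cp936).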
# Class that fetches proxies
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print "Proxy server target site: " + self.target
        req = urllib2.urlopen(self.target)
        result = req.read()
        #print chardet.detect(result)
        matchs = p.findall(result)
        for row in matchs:
            ip = row[0]
            port = row[1]
            port = map(lambda x: portdicts[x], port.split('+'))  # decode the letter-coded port
            port = ''.join(port)
            agent = row[2]
            addr = row[3].decode("cp936").encode("utf-8")
            proxy = [ip, port, addr]
            #print proxy
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()
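# All ProxyGet threads append to the shared rawProxyList; CPython's
# list.append is atomic under the GIL, so no explicit lock is needed here.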
# Class that validates proxies
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"
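    # "030173" is a string expected in the genuine Baidu homepage (it appears
    # in the ICP licence line of the page footer), so a proxy only passes if
    # it returns the real page rather than an error or ad page.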
    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            #print r'http://%s:%s' % (proxy[0], proxy[1])
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            urllib2.install_opener(opener)
            t1 = time.time()
            try:
                #req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout)
                req = opener.open(self.testUrl, timeout=self.timeout)
                #print "urlopen is ok...."
                result = req.read()
                #print "read html...."
                timeused = time.time() - t1
                pos = result.find(self.testStr)
                #print "pos is %s" % pos
                if pos > 1:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
                    #print "ok ip: %s %s %s %s" % (proxy[0], proxy[1], proxy[2], timeused)
                else:
                    continue
            except Exception, e:
                #print e.message
                continue

    def run(self):
        self.checkProxy()
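# Note: urllib2.install_opener replaces the process-wide default opener, which
# is racy with 20 checker threads running at once; it is also redundant here,
# since the request goes through the local opener.open() call anyway.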
# Function that gathers image URLs
def imgurlList(url_home):
    global imgurl_list
    home_page = urllib2.urlopen(url_home)
    url_re = re.compile(r'<li><a href="(.+?)" target="_blank" rel="nofollow">')
    # NOTE: the image pattern was garbled in the source; this is a plausible
    # reconstruction that captures absolute image URLs from <img src="...">.
    pic_re = re.compile(r'<img src="(http.+?)"')
    url_list = re.findall(url_re, home_page.read())
    for url in url_list:
        #print url_home + url
        url_page = urllib2.urlopen(url_home + url)
        for imgurl in re.findall(pic_re, url_page.read()):
            imgurl_list.append(imgurl)
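# imgurlList runs single-threaded before any downloads start: the hrefs it
# finds are site-relative, hence the url_home + url concatenation, and the
# global imgurl_list it fills is later sliced across the download threads.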
# Class that downloads images
class getPic(threading.Thread):
    def __init__(self, imgurl_list):
        threading.Thread.__init__(self)
        self.imgurl_list = imgurl_list
        self.timeout = 5

    def downloadImg(self):
        for imgurl in self.imgurl_list:
            pic_suffix = imgurl.split('.')[-1]  # take the image's file extension
            pic_name = str(random.randint(0, 10000000000)) + '.' + pic_suffix
            cookies = urllib2.HTTPCookieProcessor()
            randomCheckedProxy = random.choice(checkedProxyList)  # pick a random validated proxy
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (randomCheckedProxy[0], randomCheckedProxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            urllib2.install_opener(opener)
            try:
                data_img = opener.open(imgurl, timeout=self.timeout)
                f = open(pic_name, 'wb')
                f.write(data_img.read())
                f.close()
            except:
                continue

    def run(self):
        self.downloadImg()
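# Each getPic thread works through its own slice of URLs, drawing a fresh
# random proxy per image, so one dead proxy costs at most the single download
# that the bare except silently skips.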
if __name__ == "__main__":
    getThreads = []
    checkThreads = []
    imgurlList('http://www.ivsky.com')
    getPicThreads = []

    # Start one thread per target site to scrape proxies
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)

    for i in range(len(getThreads)):
        getThreads[i].start()

    for i in range(len(getThreads)):
        getThreads[i].join()

    print '.' * 10 + "%s proxies fetched in total" % len(rawProxyList) + '.' * 10
    # Start 20 checker threads: split the scraped proxies into 20 chunks, one per thread
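    # (len(list) + 19) / 20 is ceil(len/20) under Python 2 integer division:
    # e.g. 101 proxies give chunks of 6, and trailing threads may receive a
    # short or empty slice (slicing past the end of a list is safe).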
    for i in range(20):
        t = ProxyCheck(rawProxyList[((len(rawProxyList) + 19) / 20) * i:((len(rawProxyList) + 19) / 20) * (i + 1)])
        checkThreads.append(t)

    for i in range(len(checkThreads)):
        checkThreads[i].start()

    for i in range(len(checkThreads)):
        checkThreads[i].join()

    print '.' * 10 + "%s proxies passed the check" % len(checkedProxyList) + '.' * 10
    # Start 20 threads that each download images through a randomly chosen proxy
    for i in range(20):
        t = getPic(imgurl_list[((len(imgurl_list) + 19) / 20) * i:((len(imgurl_list) + 19) / 20) * (i + 1)])
        getPicThreads.append(t)

    for i in range(len(getPicThreads)):
        getPicThreads[i].start()

    for i in range(len(getPicThreads)):
        getPicThreads[i].join()

    print '.' * 10 + "%s images downloaded in total" % len(imgurl_list) + '.' * 10
    # Sort the proxies and persist them to disk
    f = open("proxy_list.txt", 'w+')
    for proxy in sorted(checkedProxyList, cmp=lambda x, y: cmp(x[3], y[3])):
        #print "checked proxy is: %s:%s\t%s\t%s" % (proxy[0], proxy[1], proxy[2], proxy[3])
        f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
    f.close()
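    # proxy_list.txt is sorted by response time, fastest first; each line is
    # "ip:port<TAB>location<TAB>seconds", e.g. (illustrative values only):
    # 61.135.xx.xx:80	Beijing	0.53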