import requests
import re
import random
import time


class Download:
    def __init__(self):
        self.iplist = []  # initialize a list to hold the proxy IPs we fetch
        html = requests.get("http://haoip.cc/tiqu.htm")  # fetch the free proxy list (self-explanatory!)
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        # grab everything between "r/>" and "<b" in html.text; re.S lets "." match
        # newlines as well, and findall returns a list!
        for ip in iplistn:
            i = re.sub('\n', '', ip)  # re.sub is the re module's replace method; here it replaces "\n" with nothing
            self.iplist.append(i.strip())  # append the cleaned IP to the list we initialized above
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
    def get(self, url, timeout, proxy=None, num_retries=6):  # give the function a default parameter: proxy is None
        ua = random.choice(self.user_agent_list)  # pick a random string from self.user_agent_list
        headers = {'User-Agent': ua}  # build a complete User-Agent header (ua is the string chosen above)
        if proxy is None:  # no proxy given, so get the response without one (remember what a response is? we covered it earlier!)
            try:
                return requests.get(url, headers=headers, timeout=timeout)  # the server will now take us for a real browser
            except requests.RequestException:  # if the request above raises an error, run the code below
                if num_retries > 0:  # num_retries is the retry limit we set
                    time.sleep(10)  # wait ten seconds
                    print('Failed to fetch the page; retrying in 10s. Retries left:', num_retries)
                    return self.get(url, timeout, num_retries=num_retries - 1)  # call itself with one fewer retry
                else:
                    print('Switching to a proxy')
                    time.sleep(10)
                    ip = ''.join(str(random.choice(self.iplist)).strip())  # explained below
                    proxy = {'http': ip}
                    return self.get(url, timeout, proxy)  # this time the proxy is not None
        else:  # a proxy was given
            try:
                ip = ''.join(str(random.choice(self.iplist)).strip())
                # turn a random entry from self.iplist into the format we need
                # (what the processing does you can work out yourself; that's the basics)
                proxy = {'http': ip}  # build it into a proxies dict
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)  # get the response through the proxy
            except requests.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    ip = ''.join(str(random.choice(self.iplist)).strip())
                    proxy = {'http': ip}
                    print('Switching proxy; retrying in 10s. Retries left:', num_retries)
                    print('Current proxy:', proxy)
                    return self.get(url, timeout, proxy, num_retries - 1)
                else:
                    print('The proxies are no good! Dropping the proxy')
                    return self.get(url, 3)
request = Download()  # replace the proxy source and headers with your own when you build your crawler
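
As a quick sanity check, here is a minimal usage sketch; the URL http://example.com is a placeholder of my own, not something from the original code:

# minimal usage sketch; http://example.com is an assumed placeholder URL
response = request.get("http://example.com", timeout=5)  # starts without a proxy; falls back to one after repeated failures
print(response.status_code)   # e.g. 200 on success
print(response.text[:200])    # first 200 characters of the page body

Note the design choice: retries are implemented by recursion. Each failed attempt calls self.get() again with num_retries reduced by one, and once the retries are used up the method switches to (or replaces) a proxy drawn at random from self.iplist.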