Replace the user agents and HTTP headers with your own for your crawlers

Source: Internet
Author: User

import random
import re
import time

import requests


class Download:
    """HTTP downloader that rotates User-Agent headers and, when direct
    requests keep failing, retries through random proxies scraped from a
    free proxy list (http://haoip.cc/tiqu.htm).
    """

    def __init__(self):
        # Scrape the free proxy list once at construction time and cache it.
        self.iplist = []  # list of "ip:port" strings usable as HTTP proxies
        html = requests.get("http://haoip.cc/tiqu.htm")
        # Proxy entries sit between "r/>" and "<b" in the page markup;
        # re.S lets "." also match newlines so multi-line entries are caught.
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ip in iplistn:
            # Drop embedded newlines, then surrounding whitespace.
            self.iplist.append(re.sub('\n', '', ip).strip())

        # Pool of real-browser User-Agent strings to pick from at random.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        """Fetch *url* and return the ``requests`` response object.

        Tries a direct request first (with a random User-Agent), retrying
        up to ``num_retries`` times with a 10 s pause between attempts.
        When the direct budget is exhausted it switches to random proxies
        from ``self.iplist`` and retries again.

        :param url: URL to fetch.
        :param timeout: per-request timeout in seconds, passed to requests.
        :param proxy: optional ``{'http': 'ip:port'}`` mapping; ``None``
            means "attempt a direct request first".
        :param num_retries: remaining retry budget for the current mode.
        """
        ua = random.choice(self.user_agent_list)
        headers = {'User-Agent': ua}  # pose as a real browser

        if proxy is None:
            # Direct (proxy-less) attempt.
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except Exception:
                if num_retries > 0:
                    time.sleep(10)  # back off 10 s before retrying
                    print('Failed to fetch page, retrying in 10 s; retries left:', num_retries)
                    # BUG FIX: pass the retry budget by keyword -- the
                    # original passed it positionally, where it landed in
                    # the ``proxy`` parameter instead.
                    return self.get(url, timeout, num_retries=num_retries - 1)
                print('Switching to a proxy')
                time.sleep(10)
                ip = ''.join(str(random.choice(self.iplist)).strip())
                proxy = {'http': ip}
                # Restart with a proxy and a fresh default retry budget.
                return self.get(url, timeout, proxy)

        # Proxied attempt: pick a fresh random proxy on every try.
        try:
            ip = ''.join(str(random.choice(self.iplist)).strip())
            proxy = {'http': ip}
            return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
        except Exception:
            if num_retries > 0:
                time.sleep(10)
                ip = ''.join(str(random.choice(self.iplist)).strip())
                proxy = {'http': ip}
                print('Replacing proxy, retrying in 10 s; retries left:', num_retries)
                print('Current proxy:', proxy)
                return self.get(url, timeout, proxy, num_retries - 1)
            print('Proxies failed; dropping the proxy')
            # NOTE(review): this restarts the whole direct/proxy cycle with
            # timeout=3 and can recurse indefinitely while the host stays
            # unreachable -- consider raising after a global attempt cap.
            return self.get(url, 3)

request = Download () # #

Replace the user agents and HTTP headers with your own for your crawlers

Contact Us

The content source of this page is from the Internet, and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If the content of the page is confusing, please write us an email and we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.