Using IP proxies
ProxyHandler() formats the proxy IP. Its argument is a dict whose key is the scheme of the request target (http or https); set it to match the URL you are requesting.
build_opener() builds an opener that uses the proxy handler.
install_opener() installs the opener globally, so that every later urlopen() request automatically goes through the proxy IP.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Route urllib requests through one fixed proxy IP."""
import random  # not used by this example; kept from the original listing
import urllib
import urllib.request

PROXY_ADDRESS = "180.115.8.212:39109"
TARGET_URL = "https://www.baidu.com/"
# Public proxies die often; fail after a few seconds instead of hanging.
TIMEOUT_SECONDS = 10


def install_proxy(proxy_address, scheme="https"):
    """Install a global urllib opener that sends requests through a proxy.

    Args:
        proxy_address: Proxy endpoint as ``"host:port"``.
        scheme: Scheme of the *target* URLs that should use the proxy
            (``"http"`` or ``"https"``). It must match the URL that is
            requested later, otherwise the proxy is not used for it.

    Returns:
        The opener that was installed.
    """
    proxy = urllib.request.ProxyHandler({scheme: proxy_address})
    # build_opener() already adds the default HTTP/HTTPS handlers, so only
    # the proxy handler has to be passed explicitly.
    opener = urllib.request.build_opener(proxy)
    # Make the opener global: plain urlopen() calls now use the proxy.
    urllib.request.install_opener(opener)
    return opener


def main():
    """Fetch TARGET_URL through the proxy and print the decoded page."""
    install_proxy(PROXY_ADDRESS)
    with urllib.request.urlopen(TARGET_URL, timeout=TIMEOUT_SECONDS) as response:
        data = response.read().decode("utf-8")
    print(data)


if __name__ == "__main__":
    main()
Building an IP proxy pool, method one
Suitable for proxy IPs that are long-lived and stable: an IP is picked at random from a fixed list.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Route urllib requests through a proxy picked at random from a fixed pool."""
import urllib
from urllib import request
import random

# Static pool; suitable only while these proxies stay alive.
PROXY_POOL = [
    "110.73.8.103:8123",
    "115.46.151.100:8123",
    "42.233.187.147:19",
]
TARGET_URL = "https://www.baidu.com/"
# Public proxies die often; fail after a few seconds instead of hanging.
TIMEOUT_SECONDS = 10


def dai_li_ip():
    """Pick a random proxy from PROXY_POOL and install it globally.

    The chosen address is printed. The proxy is registered for ``https``
    targets, so it only applies to URLs with that scheme.

    Returns:
        The opener that was installed.
    """
    chosen = random.choice(PROXY_POOL)
    print(chosen)
    proxy = urllib.request.ProxyHandler({"https": chosen})
    # build_opener() already adds the default HTTP/HTTPS handlers.
    opener = urllib.request.build_opener(proxy)
    # Make the opener global: plain urlopen() calls now use the proxy.
    urllib.request.install_opener(opener)
    return opener


def main():
    """Install a random proxy, then fetch and print TARGET_URL."""
    dai_li_ip()
    with urllib.request.urlopen(TARGET_URL, timeout=TIMEOUT_SECONDS) as response:
        data = response.read().decode("utf-8")
    print(data)


if __name__ == "__main__":
    main()
Building an IP proxy pool, method two: API mode
Each call asks a third-party API for a fresh IP, which suits proxies whose lifetime is short.
We use the third-party API at http://http.zhimaruanjian.com/ for testing.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Route urllib requests through a proxy obtained from a third-party API."""
import urllib
from urllib import request
import json

PROXY_API_URL = (
    "http://http-webapi.zhimaruanjian.com/getip"
    "?num=1&type=2&pro=&city=0&yys=0&port=11&time=1"
    "&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1"
)
TARGET_URL = "https://www.baidu.com/"
# Short-lived proxies die often; fail after a few seconds instead of hanging.
TIMEOUT_SECONDS = 10


def parse_proxy_address(payload):
    """Return ``"ip:port"`` for the first proxy in a decoded API response.

    Args:
        payload: The JSON-decoded response. It is expected to look like
            ``{"data": [{"ip": ..., "port": ...}, ...]}`` (lower-case keys
            are assumed here; confirm against the provider's API).

    Raises:
        ValueError: If the response contains no usable proxy entry, e.g.
            when the API reports an error and ``data`` is empty.
    """
    try:
        entry = payload["data"][0]
        return "{}:{}".format(entry["ip"], entry["port"])
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(
            "proxy API returned no usable proxy: {!r}".format(payload)
        ) from exc


def dai_li_ip():
    """Fetch a fresh proxy from the API and install it globally.

    The raw proxy entry and the resulting address are printed. The proxy is
    registered for ``https`` targets only.

    Returns:
        The opener that was installed.

    Raises:
        urllib.error.URLError: If the proxy API cannot be reached.
        ValueError: If the API response is not valid JSON or holds no proxy.
    """
    with urllib.request.urlopen(PROXY_API_URL, timeout=TIMEOUT_SECONDS) as response:
        data = response.read().decode("utf-8")
    data2 = json.loads(data)  # turn the JSON text back into Python objects
    zh_ip = parse_proxy_address(data2)
    print(data2["data"][0])
    print(zh_ip)
    proxy = urllib.request.ProxyHandler({"https": zh_ip})
    # build_opener() already adds the default HTTP/HTTPS handlers.
    opener = urllib.request.build_opener(proxy)
    # Make the opener global: plain urlopen() calls now use the proxy.
    urllib.request.install_opener(opener)
    return opener


def main():
    """Install a freshly fetched proxy, then fetch and print TARGET_URL."""
    dai_li_ip()
    with urllib.request.urlopen(TARGET_URL, timeout=TIMEOUT_SECONDS) as response:
        data = response.read().decode("utf-8")
    print(data)


if __name__ == "__main__":
    main()
Combining a user-agent pool with an IP proxy
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl a page using both a random User-Agent and an API-supplied proxy."""
import urllib
import urllib.parse
from urllib import request
import json
import random

# User-agent pool; one entry is picked at random per opener.
USER_AGENTS = [
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/UCWEB7.0.2.37/28/999",
    "Openwave/UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]

PROXY_API_URL = (
    "http://http-webapi.zhimaruanjian.com/getip"
    "?num=1&type=2&pro=&city=0&yys=0&port=11&time=1"
    "&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1"
)
# Short-lived proxies die often; fail after a few seconds instead of hanging.
TIMEOUT_SECONDS = 10


def parse_proxy_address(payload):
    """Return ``"ip:port"`` for the first proxy in a decoded API response.

    ``payload`` is expected to look like
    ``{"data": [{"ip": ..., "port": ...}, ...]}`` (lower-case keys are
    assumed here; confirm against the provider's API).

    Raises:
        ValueError: If the response contains no usable proxy entry.
    """
    try:
        entry = payload["data"][0]
        return "{}:{}".format(entry["ip"], entry["port"])
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(
            "proxy API returned no usable proxy: {!r}".format(payload)
        ) from exc


def build_proxy_opener(proxy_address, scheme="https"):
    """Build (without installing) an opener that proxies ``scheme`` targets."""
    proxy = urllib.request.ProxyHandler({scheme: proxy_address})
    # build_opener() already adds the default HTTP/HTTPS handlers.
    return urllib.request.build_opener(proxy)


def yh_dl(opener=None):
    """Set a random User-Agent header on an opener and install it globally.

    Args:
        opener: Opener to decorate. Pass the opener returned by
            ``dai_li_ip()`` to keep its proxy. install_opener() replaces
            the global opener wholesale, so calling ``yh_dl()`` with no
            argument after ``dai_li_ip()`` would drop the proxy. When
            omitted, a plain opener is created.

    Returns:
        The opener that was installed.
    """
    if opener is None:
        opener = urllib.request.build_opener()
    thisua = random.choice(USER_AGENTS)  # pick a random user agent
    opener.addheaders = [("User-Agent", thisua)]
    # Make the opener global: plain urlopen() calls now send this header.
    urllib.request.install_opener(opener)
    return opener


def dai_li_ip():
    """Fetch a fresh proxy from the API and install it globally.

    The raw proxy entry and the resulting address are printed. The proxy is
    registered for ``https`` targets only.

    Returns:
        The opener that was installed.

    Raises:
        urllib.error.URLError: If the proxy API cannot be reached.
        ValueError: If the API response is not valid JSON or holds no proxy.
    """
    with urllib.request.urlopen(PROXY_API_URL, timeout=TIMEOUT_SECONDS) as response:
        data = response.read().decode("utf-8")
    data2 = json.loads(data)  # turn the JSON text back into Python objects
    zh_ip = parse_proxy_address(data2)
    print(data2["data"][0])
    print(zh_ip)
    opener = build_proxy_opener(zh_ip)
    urllib.request.install_opener(opener)
    return opener


def main():
    """Search Taobao for a keyword through the proxy with a random UA."""
    # Chain the two steps on ONE opener so both proxy and header apply.
    yh_dl(dai_li_ip())
    gjci = "dress"
    # Percent-encode the keyword: URLs cannot carry raw non-ASCII text.
    zh_gjci = urllib.parse.quote(gjci)
    url = "https://s.taobao.com/search?q=%s&s=0" % zh_gjci
    # print(url)
    with urllib.request.urlopen(url, timeout=TIMEOUT_SECONDS) as response:
        data = response.read().decode("utf-8")
    print(data)


if __name__ == "__main__":
    main()
Combining a user-agent pool with an IP proxy, packaged as a reusable module
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawler helper module: fetch a page with a random User-Agent via a proxy."""
import urllib
import urllib.parse
from urllib import request
import json
import logging
import random
import re  # kept from the original listing; scheme parsing now uses urllib.parse
import urllib.error

logger = logging.getLogger(__name__)

# User-agent pool; one entry is picked at random per request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/UCWEB7.0.2.37/28/999",
    "Openwave/UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]

PROXY_API_URL = (
    "http://http-webapi.zhimaruanjian.com/getip"
    "?num=1&type=2&pro=&city=0&yys=0&port=11&time=1"
    "&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1"
)
# Short-lived proxies die often; fail after a few seconds instead of hanging.
DEFAULT_TIMEOUT = 10


def _url_scheme(url):
    """Return the lower-cased scheme of ``url``; raise ValueError if absent."""
    scheme = urllib.parse.urlsplit(url).scheme
    if not scheme:
        raise ValueError("URL has no scheme (http/https): {!r}".format(url))
    return scheme


def parse_proxy_address(payload):
    """Return ``"ip:port"`` for the first proxy in a decoded API response.

    ``payload`` is expected to look like
    ``{"data": [{"ip": ..., "port": ...}, ...]}`` (lower-case keys are
    assumed here; confirm against the provider's API).

    Raises:
        ValueError: If the response contains no usable proxy entry.
    """
    try:
        entry = payload["data"][0]
        return "{}:{}".format(entry["ip"], entry["port"])
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(
            "proxy API returned no usable proxy: {!r}".format(payload)
        ) from exc


def _fetch_proxy_address(timeout):
    """Ask the proxy API for one proxy and return it as ``"ip:port"``."""
    with urllib.request.urlopen(PROXY_API_URL, timeout=timeout) as response:
        raw = response.read().decode("utf-8")
    return parse_proxy_address(json.loads(raw))


def build_crawler_opener(hq_url, proxy_address, user_agent=None):
    """Build one opener carrying BOTH the proxy and a User-Agent header.

    Installing two separate openers does not work: install_opener()
    replaces the global opener, so the second call discards the first.

    Args:
        hq_url: The URL that will be requested; its scheme decides which
            protocol the proxy is registered for.
        proxy_address: Proxy endpoint as ``"host:port"``.
        user_agent: Header value to send; a random entry of USER_AGENTS
            when omitted.

    Raises:
        ValueError: If ``hq_url`` has no scheme.
    """
    proxy = urllib.request.ProxyHandler({_url_scheme(hq_url): proxy_address})
    # build_opener() already adds the default HTTP/HTTPS handlers.
    opener = urllib.request.build_opener(proxy)
    if user_agent is None:
        user_agent = random.choice(USER_AGENTS)
    opener.addheaders = [("User-Agent", user_agent)]
    return opener


def hq_html(hq_url, timeout=DEFAULT_TIMEOUT):
    """Fetch a page through a fresh proxy with a random User-Agent.

    The opener that is built is also installed globally, so later plain
    urlopen() calls reuse the same proxy and header.

    Example::

        html = hq_html('http://www.baidu.com/')

    Args:
        hq_url: URL of the page to crawl (must include http:// or https://).
        timeout: Seconds to wait for the proxy API and for the page.

    Returns:
        The page body decoded as UTF-8, or ``None`` when no proxy could be
        obtained or the request failed (the reason is logged).

    Raises:
        ValueError: If ``hq_url`` has no scheme.
    """
    # Validate first so a bad URL does not burn a proxy from the API.
    _url_scheme(hq_url)
    try:
        proxy_address = _fetch_proxy_address(timeout)
    except (OSError, ValueError) as exc:
        # OSError covers URLError and timeouts; ValueError covers bad JSON
        # and responses without a proxy entry.
        logger.warning("could not obtain a proxy for %s: %s", hq_url, exc)
        return None

    opener = build_crawler_opener(hq_url, proxy_address)
    urllib.request.install_opener(opener)
    try:
        with opener.open(hq_url, timeout=timeout) as response:
            return response.read().decode("utf-8")
    except urllib.error.HTTPError as exc:
        # Only HTTPError carries a status code.
        logger.warning("HTTP %s for %s: %s", exc.code, hq_url, exc.reason)
    except OSError as exc:
        # URLError (a subclass of OSError), timeouts, connection resets.
        logger.warning("request to %s failed: %s", hq_url, exc)
    return None
Using the module
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Example: fetch a Baidu search results page through the fzhpach module."""
import urllib.parse
import urllib.request

import fzhpach


def main():
    """Search Baidu for a keyword via fzhpach.hq_html() and print the HTML."""
    gjc = "ad recording"
    # Percent-encode the keyword: URLs cannot carry raw non-ASCII text or
    # spaces.
    gjc = urllib.parse.quote(gjc)
    url = "https://www.baidu.com/s?wd=%s&pn=0" % gjc
    a = fzhpach.hq_html(url)
    print(a)


if __name__ == "__main__":
    main()
Web crawler explained, part 8: urllib library crawler (2) – IP proxies – combining user agents with IP proxies