This article introduces how a Python crawler can set a proxy IP and disguise itself as a browser. It has some reference value and is shared here for anyone who needs it.
1. Python crawler browser disguise
# Import the urllib.request module
import urllib.request

# Set the header
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")

# Create an opener
opener = urllib.request.build_opener()

# Add the headers to the opener
opener.addheaders = [headers]

# Install the opener globally
urllib.request.install_opener(opener)

# Open the page with urlopen
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
2. Setting up the proxy
# Define the proxy IP
proxy_addr = "122.241.72.191:808"

# Set the proxy
proxy = urllib.request.ProxyHandler({'http': proxy_addr})

# Create an opener
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)

# Install the opener globally
urllib.request.install_opener(opener)

# Open the page with urlopen
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
3. Simultaneous setup with proxy and simulated browser access
# Define the proxy IP
proxy_addr = "122.241.72.191:808"

# Create a request
req = urllib.request.Request(url)

# Add headers
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)")

# Set the proxy
proxy = urllib.request.ProxyHandler({'http': proxy_addr})

# Create an opener
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)

# Install the opener globally
urllib.request.install_opener(opener)

# Open the page with urlopen
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
4. Add more information to the request header
import urllib.request

page_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Host": "www.baidu.com",
    "Cookie": "xxxxxxxx"
}
req = urllib.request.Request(url, headers=page_headers)
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
5. Add POST Request parameters
import urllib.request
import urllib.parse

# Set the POST parameters
page_data = urllib.parse.urlencode([('pn', page_num), ('kd', keywords)])

# Set the headers
page_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Connection': 'keep-alive',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Cookie': 'JSESSIONID=ABAAABAABEEAAJA8F28C00A88DC4D771796BB5C6FFA2DDA; user_trace_token=20170715131136-d58c1f22f6434e9992fc0b35819a572b',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98?labelWords=&fromSearch=true&suginput=',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}

# Open the page
req = urllib.request.Request(url, headers=page_headers)
data = urllib.request.urlopen(req, data=page_data.encode('utf-8')).read().decode('utf-8')
6. Using PHANTOMJS to simulate browser requests
# 1. Download PhantomJS, install it locally, and set the environment variable
from selenium import webdriver

bs = webdriver.PhantomJS()

# Open the URL
bs.get(url)

# Get the page source
url_data = bs.page_source

# Save the browsed page as a picture
bs.get_screenshot_as_file(filename)
7.phantomjs setting User-agent and Cookies
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
bs = webdriver.PhantomJS(desired_capabilities=dcap)
bs.get(url)

# Delete the cookies
bs.delete_all_cookies()

# Set a cookie
# Cookie format: as viewed in a browser; a cookie needs to contain the following parameters: domain, name, value, path
cookie = {
    'domain': '.www.baidu.com',  # note the leading dot
    'name': 'xxxx',
    'value': 'xxxx',
    'path': 'xxxx'
}

# Add the cookie to PhantomJS
bs.add_cookie(cookie)
8. Using the Web_driver tool
# 1. Download the web driver tool (such as chromedriver.exe) and the corresponding browser
# 2. Put chromedriver.exe in a directory, such as C:\chromedriver.exe
from selenium import webdriver

driver = webdriver.Chrome(executable_path="C:\chromedriver.exe")

# Open the URL
driver.get(url)