1. Create a new file named "middlewares.py" in the Scrapy project directory
# base64 is only needed when the proxy we are going to use requires
# authentication.
import base64


class ProxyMiddleware(object):
    """Downloader middleware that routes every request through one HTTP proxy.

    Replace the placeholder proxy address (and, if the proxy needs them, the
    placeholder credentials) with real values before use.
    """

    def process_request(self, request, spider):
        """Attach the proxy location and Basic credentials to *request*.

        Args:
            request: the outgoing request; its ``meta`` and ``headers``
                mappings are modified in place.
            spider: the spider that issued the request (unused).

        Returns:
            None, so that processing of the request continues normally.
        """
        # Set the location of the proxy.
        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"

        # Use the following lines only if your proxy requires authentication.
        proxy_user_pass = "USERNAME:PASSWORD"

        # Set up Basic authentication for the proxy.  b64encode is used
        # instead of the legacy encodestring because the latter appends a
        # trailing newline (which corrupts the header value) and no longer
        # exists in Python 3.9+.
        encoded_user_pass = base64.b64encode(
            proxy_user_pass.encode('utf-8')
        ).decode('ascii')
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
2. Add the following to the project configuration file (./project_name/settings.py):
# Enable the custom proxy middleware together with Scrapy's built-in
# HttpProxyMiddleware.  Middlewares with lower numbers have their
# process_request() called first, so ProxyMiddleware (100) fills in
# request.meta['proxy'] before HttpProxyMiddleware (110) sees the request.
# NOTE(review): the priority of the first entry was missing in the garbled
# original; 110 is the value used in the commonly circulated version of this
# snippet -- confirm.
# NOTE: the scrapy.contrib.* paths belong to old Scrapy releases; on
# Scrapy >= 1.0 the module is scrapy.downloadermiddlewares.httpproxy.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
    'project_name.middlewares.ProxyMiddleware': 100,
}
With just these two steps, requests now go through the proxy. Let's test it ^_^.
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request


class TestSpider(CrawlSpider):
    """Minimal spider used to check that requests really go through the proxy.

    It downloads the start URL and saves the raw response body to
    ``test.html`` so the result can be inspected by hand.
    """

    name = "test"
    domain_name = "whatismyip.com"
    # The following URL is subject to change; you can get the last updated
    # one from here:
    # http://www.whatismyip.com/faq/automation.asp
    start_urls = ["http://xujian.info"]

    def parse(self, response):
        """Write the body of *response* to ``test.html`` in binary mode."""
        # Use a context manager so the file handle is closed (and the data
        # flushed) even if the write raises.
        with open('test.html', 'wb') as output:
            output.write(response.body)
3. Using a random User-Agent
By default, a Scrapy crawl uses a single User-Agent, which makes it easy for a site to block the crawler. The following code randomly selects a User-Agent from a predefined list, so that different pages are fetched with different User-Agents.
Add the following code to settings.py:
# Disable Scrapy's built-in UserAgentMiddleware (value None) and enable the
# rotating replacement with priority 400.
# NOTE: the scrapy.contrib.* path belongs to old Scrapy releases; on
# Scrapy >= 1.0 the built-in middleware lives at
# scrapy.downloadermiddlewares.useragent.UserAgentMiddleware.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'Crawler.comm.rotate_useragent.RotateUserAgentMiddleware': 400,
}
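Note: "Crawler" is the name of your project (that is, the name of the project directory); replace it with your own. Below is the code of the middleware (the rotate_useragent module referenced in the setting above):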
#!/usr/bin/python
# -*- coding: utf-8 -*-

import random
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that picks a random User-Agent for each request.

    The candidates come from the class attribute ``user_agent_list``.
    """

    def __init__(self, user_agent=''):
        """Store *user_agent*; it is not consulted when rotating."""
        super(RotateUserAgentMiddleware, self).__init__(user_agent)
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent header on *request*.

        ``setdefault`` is used, so a User-Agent header that is already
        present on the request is left untouched.
        """
        # Pick one user agent at random from the predefined list.
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    # The default user_agent_list is composed of Chrome, IE, Firefox,
    # Mozilla, Opera and Netscape style strings.
    # For more user agent strings, see
    # http://www.useragentstring.com/pages/useragentstring.php
    #
    # Every entry ends with a comma: the original was missing the comma after
    # the first entry, which silently concatenated the first two strings into
    # one invalid User-Agent.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]