We continue with the Douban Top 250 crawler example and add downloader middleware, mainly to set a dynamic User-Agent and a proxy IP.
In Scrapy, switching the proxy IP and the User-Agent is controlled through the DOWNLOADER_MIDDLEWARES setting.
We create a file named middlewares.py in the same directory as settings.py; it wraps every outgoing request.
middlewares.py
# -*- coding: utf-8 -*-
"""Downloader middlewares that rotate the User-Agent header and the HTTP proxy."""

import base64
import random


class RandomUserAgent(object):
    """Give each outgoing request a randomly chosen User-Agent header.

    The candidate strings are read from the ``USER_AGENTS`` project setting
    when Scrapy builds the middleware through ``from_crawler``.
    """

    def __init__(self, user_agents=None):
        """Store the pool of User-Agent strings (an empty pool disables the middleware)."""
        self.user_agents = list(user_agents or [])

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``USER_AGENTS`` setting."""
        # Reading crawler.settings avoids ``from settings import ...``, which
        # only resolves when the project directory itself is on sys.path.
        return cls(crawler.settings.getlist("USER_AGENTS"))

    def process_request(self, request, spider):
        """Set a random User-Agent on ``request`` unless one is already present.

        Returns ``None`` so the request continues through the remaining
        middlewares.
        """
        if not self.user_agents:
            # Nothing configured: leave the request untouched instead of
            # letting random.choice raise IndexError on an empty list.
            return None
        request.headers.setdefault("User-Agent", random.choice(self.user_agents))
        return None


class RandomProxy(object):
    """Route each outgoing request through a randomly chosen HTTP proxy.

    Every entry of the ``PROXIES`` project setting is a dict with an
    ``'Ip_port'`` key (``"host:port"``) and a ``'user_passwd'`` key
    (``"user:password"``, or empty/``None`` for a proxy without authentication).
    """

    def __init__(self, proxies=None):
        """Store the pool of proxy descriptions (an empty pool disables the middleware)."""
        self.proxies = list(proxies or [])

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``PROXIES`` setting."""
        return cls(crawler.settings.getlist("PROXIES"))

    def process_request(self, request, spider):
        """Attach a random proxy, and its credentials if any, to ``request``.

        Returns ``None`` so the request continues through the remaining
        middlewares.
        """
        if not self.proxies:
            return None

        proxy = random.choice(self.proxies)
        # The meta key must be the lower-case 'proxy' (the key Scrapy's
        # HttpProxyMiddleware looks up) and the URL needs the full "http://".
        request.meta["proxy"] = "http://" + proxy["Ip_port"]

        credentials = proxy.get("user_passwd")
        if credentials:
            # HTTP Basic auth: base64-ENCODE "user:password" and separate it
            # from the scheme name with a single space.
            token = base64.b64encode(credentials.encode("utf-8")).decode("ascii")
            request.headers["Proxy-Authorization"] = "Basic " + token
        return None
settings.py
# Pool of browser User-Agent strings the RandomUserAgent middleware picks from.
# Setting names are upper-case because Scrapy only loads upper-case names
# from a settings module.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# Pool of HTTP proxies the RandomProxy middleware picks from.
# 'Ip_port' is "host:port"; 'user_passwd' is "user:password", or an empty
# string for a free proxy that needs no authentication.
PROXIES = [
    {'Ip_port': '111.8.60.9:8123', 'user_passwd': ''},
    {'Ip_port': '101.71.27.120:80', 'user_passwd': 'User2:pass2'},
    {'Ip_port': '122.96.59.104:80', 'user_passwd': 'USER3:PASS3'},
    {'Ip_port': '122.224.249.122:8088', 'user_passwd': 'USER4:PASS4'},
]

# Enable both custom middlewares. Dotted paths are case-sensitive, so both
# entries use the same lower-case 'douban' package name.
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgent': 100,
    'douban.middlewares.RandomProxy': 200,
}
Python crawler framework Scrapy, example (4): downloader middleware settings