Automatically log in and get PDF files
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on: unknown (the timestamp is unreadable in the recovered source)
# Project: pdf_spider
"""pyspider project that walks club.topsage.com and reports attachment links.

The crawl starts at the site root, follows forum listing pages (including
their pagination links) and thread pages, and emits one result per thread
page: the first link that looks like a downloadable document or a forum
attachment.
"""

import re

from pyspider.libs.base_handler import *

_SITE_ROOT = 'http://club.topsage.com/'

# URL shapes used to route links to the right callback.  ``re.match`` anchors
# at the start of the string, so the two shapes are mutually exclusive.
_FORUM_URL = re.compile(r'http://club\.topsage\.com/forum-.+\.html', re.U)
_THREAD_URL = re.compile(r'http://club\.topsage\.com/thread-.+\.html', re.U)

# Direct links to document/archive files.  Matching is case-insensitive so
# that ``.PDF`` and ``.pdf`` are both accepted; ``xls``/``xlsx`` are accepted
# in addition to the original ``excel`` entry, which is not a real extension.
_FILE_URL = re.compile(
    r'^(http|ftp|https)://.+\.'
    r'(zip|rar|tar|pdf|doc|docx|excel|xls|xlsx|ppt|pptx)$',
    re.U | re.I,
)
# Attachment download endpoint of the forum software.
_ATTACHMENT_URL = re.compile(
    r'^http://club\.topsage\.com/forum\.php\?mod=attachment.+', re.U)

# NOTE(review): the numeric literals were stripped from the recovered source.
# The two schedules below are restored from the surviving factor counts
# ("minutes= _*_" and "age=Ten*_*_*_"); please confirm.
_RESTART_EVERY_MINUTES = 24 * 60
_PAGE_MAX_AGE_SECONDS = 10 * 24 * 60 * 60
# NOTE(review): the original timeout value is lost entirely; 120 seconds is
# an assumption and must be confirmed.
_FETCH_TIMEOUT_SECONDS = 120

# SECURITY: these are hard-coded session cookies, including an auth token.
# Presumably they were copied from a logged-in browser session; they expire
# and should be replaced (ideally loaded from configuration, not committed).
# The module-level name ``Cookie`` is kept because the original published it
# as a global.
Cookie = {
    "tsclub_bb90_saltkey": "xozcc32l",
    "tsclub_bb90_lastvisit": "1428457605",
    "tsclub_bb90_visitedfid": "326",
    "tsclub_bb90_ulastactivity": "1428579196%7c0",
    "tsclub_bb90_auth": "F9F8KCRDAJ3Q9AY9OXESFGE2CZ%2BARVK0GZ5JV%2BQOHYHCTLJEOPEZRXU%2FEBSF6PK%2B754%2FSI5DNB0W%2BMSMLWMVTC3XKWLT",
    "tsclub_bb90_lastcheckfeed": "5470207%7c1428579196",
    "tsclub_bb90_lip": "122.13.84.73%2c1428579196",
    "tsclub_bb90_nofavfid": "1",
    "pgv_pvi": "8694210858",
    "pgv_info": "ssi=s5025153920",
    "Hm_lvt_ee0d63d2db0dfbf9e0d399bccbd5fce7": "1428461128,1428578830",
    "Hm_lpvt_ee0d63d2db0dfbf9e0d399bccbd5fce7": "1428581442",
    "tsclub_bb90_lastact": "1428581519%09misc.php%09patch",
    "tjpctrl": "1428583242081",
}


class Handler(BaseHandler):
    """Crawl forum listings and thread pages and report attachment links.

    Flow: ``on_start`` -> ``index_page`` -> ``forum_page`` (which re-queues
    further forum pages and pagination links) -> ``detail_page``.
    """

    # Browser-like request headers sent with every fetch.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        # List members are comma-separated.  ``sdch`` is not advertised: it is
        # a retired browser-only coding that the crawler has no decoder for.
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh,en-US;q=0.8",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.4368.102 Safari/537.36",
        "Host": "club.topsage.com",
        "Pragma": "no-cache",
        # The standard header name is "Referer"; the original sent "Refer".
        "Referer": "http://club.topsage.com",
        "Connection": "keep-alive",
    }

    # Defaults applied to every ``self.crawl`` call; keys must be lowercase
    # to match the ``self.crawl`` parameter names.
    crawl_config = {
        "headers": headers,
        "timeout": _FETCH_TIMEOUT_SECONDS,
        "cookies": Cookie,
    }

    @every(minutes=_RESTART_EVERY_MINUTES)
    def on_start(self):
        """Seed the crawl with the site root."""
        self.crawl(_SITE_ROOT, callback=self.index_page)

    @config(age=_PAGE_MAX_AGE_SECONDS)
    def index_page(self, response):
        """Queue every forum and thread link found on the front page."""
        for each in response.doc('a[href^="http"]').items():
            self._crawl_forum_or_thread(each.attr.href)

    @config(age=_PAGE_MAX_AGE_SECONDS, priority=2)
    def forum_page(self, response):
        """Queue thread, sub-forum and pagination links of a forum listing."""
        for each in response.doc('a[href^="http://club.topsage.com"]').items():
            self._crawl_forum_or_thread(each.attr.href)

        # Next-page links: every anchor at this depth is treated as another
        # forum listing page, exactly as the original did.
        for each in response.doc('html > body > div > div > div > div > a').items():
            href = each.attr.href
            if href:  # anchors without an href cannot be crawled
                self.crawl(href, callback=self.forum_page)

    @config(priority=2)
    def detail_page(self, response):
        """Return the first attachment link on a thread page.

        Returns:
            A dict with ``download_url`` and ``file_name`` for the first
            matching link, or ``None`` when the page has no matching link.
        """
        print('detail_page >> Response URL is ' + response.url)
        for each in response.doc('table tr > td > a').items():
            href = each.attr.href
            if self.is_url_matched(href):
                print('Attachment URL is ' + href)
                # Only the first match is reported, as in the original.
                return {
                    "download_url": href,
                    "file_name": each.text(),
                }
        return None

    def is_url_matched(self, url):
        """Tell whether *url* points at a downloadable file or attachment.

        Args:
            url: Candidate link; ``None`` or an empty string is accepted.

        Returns:
            ``True`` when *url* ends in one of the known document/archive
            extensions or targets the forum's attachment endpoint, otherwise
            ``False``.
        """
        if not url:
            return False
        return bool(_FILE_URL.match(url) or _ATTACHMENT_URL.match(url))

    def _crawl_forum_or_thread(self, url):
        """Schedule *url* with the callback matching its URL shape, if any."""
        if not url:
            return
        if _THREAD_URL.match(url):
            self.crawl(url, callback=self.detail_page)
        elif _FORUM_URL.match(url):
            self.crawl(url, callback=self.forum_page)
Pyspider example code VII: Automatically log in and get PDF files