Copider imitates some of Scrapy's naming and API; of course, it runs as a single process and is synchronous rather than asynchronous.
1. File: copider/copider.py
# coding=utf-8
"""
Created on October 8, 2015

@author: snt1
"""

import urllib2
import StringIO

import lxml.html


class Spider(object):
    def __init__(self, url, meta=None):
        self.URL = url
        self.META = meta
        self.TEXTMARK = self.get(url)
        self.SEL = self.selector(doc=self.TEXTMARK)

    def get(self, url):
        # Fetch the page with a desktop Chrome User-Agent and return its HTML.
        shtml = ''
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent',
                           'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/44.0.2403.155 Safari/537.36')
            shtml = urllib2.urlopen(req, timeout=15).read()
        except Exception, e:
            print e, "... next ."
        data = StringIO.StringIO(shtml)
        html = data.read()
        return html

    @property
    def html(self):
        return self.TEXTMARK

    @property
    def url(self):
        return self.URL

    @property
    def meta(self):
        return self.META

    def selector(self, doc=None):
        if doc:
            html = doc
        else:
            html = self.html
        return lxml.html.fromstring(html)

    def xpath(self, rule):
        # If the rule selects elements, return their attribute dicts;
        # for text()/@attr rules (plain strings) just return the raw result list.
        iter_list = self.SEL.xpath(rule)
        attrlist = []
        try:
            for ele in iter_list:
                attrlist.append(ele.attrib)
            return attrlist
        except Exception, e:
            return iter_list


def Request(url, func, **meta):
    # Synchronous stand-in for scrapy.Request: fetch the page right away
    # and hand the Spider "response" to the callback.
    if meta:
        response = Spider(url, meta['meta'])
    else:
        response = Spider(url)
    func(response)
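A minimal usage sketch of the Spider/Request API above, to show how it mirrors Scrapy's response/callback style. The URL, the parse() callback and the 'tag' meta key are placeholders of mine, not part of the original post.

# Hypothetical usage sketch of copider; example.com and parse() are placeholders.
from copider import Spider, Request

def parse(response):
    # The response object mimics a (much simplified) Scrapy response:
    # .url, .meta and .xpath() are available.
    print(response.url)
    print(response.meta)
    print(response.xpath('//title/text()'))

# Request() is synchronous: it fetches the page immediately
# and calls the callback with the resulting Spider object.
Request('http://example.com', parse, meta={'tag': 'demo'})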
2. File: copider/aero.py
# coding=utf-8
"""
Created on October 8, 2015

@author: snt1
"""

import re
import time

from copider import Spider, Request


class AeroCopider(object):
    name = "Aero"
    storeId = "554b14c97b010cc731e81b35"  # site ID
    allowed_domains = ["www.xxxx.com"]
    root_url = 'http://www.xxxx.com'
    category_url = root_url + '/category/index.jsp?numresultsperpage=100&categoryid=%s'
    cap_category_url = root_url + '/family/index.jsp?categoryid=%s&page=%d&numresultsperpage=100'
    url_dicts = {'3534623': 'Girls', '3534624': 'Guys'}

    def __init__(self):
        self.start_urls()

    def start_urls(self):
        # Walk the top-level categories and collect the sub-category links
        # from the left sidebar.
        for fid in self.url_dicts.keys():
            url = self.category_url % fid
            response = Spider(url)
            node_a = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/@href')
            node_text = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/text()')
            url_list, cid_list = [], []
            for num, href in enumerate(node_a):
                pattern = re.compile(r'family\.jsp\?categoryid=')
                if pattern.search(href):
                    chd_url = self.root_url + href
                    # Strip the trailing "&cp=..." part of the link.
                    pattern_sub = re.compile('&cp=.*?$')
                    chd_url = pattern_sub.sub('', chd_url)
                    pattern_fin = re.compile(r'family\.jsp\?categoryid=(\d+)')
                    cid = pattern_fin.findall(chd_url)[0]
                    url_list.append(chd_url)
                    cid_list.append(cid)
                    print(u'Product category link: %s, %s' % (node_text[num], chd_url))
                    cateid = cid_list[num]
                    Request(chd_url, self.parse_page, meta={'cateid': cateid})
            print

    def parse_page(self, response):
        # total_page = response.xpath('//div[@class="pagination"]/ul/li/a[@rel="nofollow"]/text()')
        total_items = int(response.xpath('//*[@id="main-wrap"]//li[@class="count"]/span/text()')[0])
        mod, rem = divmod(total_items, 100)
        if mod > 1:
            if rem > 0:
                mod += 1
        else:
            mod = 1
        total_page = mod
        print(u'Total product pages: %s, %s' % (total_page, response.url))
        cateid = response.meta['cateid']
        for page in range(1, total_page + 1):
            url = self.cap_category_url % (cateid, page)
            Request(url, self.parse_product)

    def parse_product(self, response):
        product = response.xpath('//*[@id="products"]//h4/a/@href')
        print(u'From page: %s' % response.url)
        print(u'Products: %s, paths: %s' % (len(product), product))


if __name__ == '__main__':
    AeroCopider()
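Running python aero.py starts the crawl from the __main__ block: because Request() is just a blocking function call, start_urls(), parse_page() and parse_product() run one after another in a single process, instead of being scheduled by an asynchronous engine as they would be in Scrapy.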
Write your own crawler: copider.