# Scrapy spider: crawl an emoticon site and download its GIF emoticon packs.
import os
import re
import sys

import requests
import scrapy
class Scrapyone(scrapy.Spider):
    """Crawl qq.yh31.com emoticon categories and download every GIF.

    Callback chain:
        parse   -- category listing page: one request per category
        parse1  -- category page: discover page count, request each page
        parse2  -- listing page: download each GIF into C:\\GIF\\<category>
    """

    name = "stackone"
    start_urls = ["http://qq.yh31.com/ql/bd/"]

    def parse(self, response):
        """Yield one request per emoticon category found on the start page.

        The category title travels with the request in
        ``meta['key']['dirname']`` so later callbacks know the folder name.
        """
        for li in response.xpath('//*[@id="main_bblm"]/div[2]/dl/dd/li'):
            href = ''.join(li.xpath('a/@href').extract())
            hreftext = ''.join(li.xpath('a/text()').extract())
            # Skip the ">More>" pagination anchor -- it is not a category.
            # NOTE(review): the anchor text on the live site may be
            # localized (the source was machine-translated); confirm.
            if hreftext == '>More>':
                continue
            # Root download directory; exist_ok avoids the check/create race.
            os.makedirs('C:\\GIF', exist_ok=True)
            item = {'dirname': hreftext}
            yield scrapy.Request(
                url='http://qq.yh31.com' + href,
                meta={'key': item},
                callback=self.parse1,
            )

    def parse1(self, response):
        """Work out how many pages the category has and request each one."""
        # Second pagination anchor, e.g. ".../list_3.html"; its digits are
        # the total page count for this category.
        pager = response.xpath('//*[@id="pe100_page_infolist"]/a[2]/@href')
        href = ''.join(pager.extract())
        pages = ''.join(pager.re(r'\d+'))
        base = href.split('_')[0]
        ite = {'dirn': response.meta['key']['dirname']}
        for page_no in range(1, int(pages) + 1):
            full_url = 'http://qq.yh31.com' + base + '_' + str(page_no) + '.html'
            yield scrapy.Request(url=full_url, meta={'key1': ite}, callback=self.parse2)

    def parse2(self, response):
        """Download every GIF on one listing page into its category folder."""
        dirname = response.meta['key1']['dirn']
        path = os.path.join('C:\\GIF', dirname)
        os.makedirs(path, exist_ok=True)
        for anchor in response.xpath('//*[@id="main_bblm"]/div[1]/li/dt/a'):
            alt = ''.join(anchor.xpath('img/@alt').extract())
            src = 'http://qq.yh31.com' + ''.join(anchor.xpath('img/@src').extract())
            # Fetch the image bytes directly; the alt text names the file.
            reply = requests.get(src)
            with open(os.path.join(path, alt + '.gif'), 'wb') as gif_file:
                gif_file.write(reply.content)
# How to run: open cmd, switch to the directory containing this script,
# then execute: scrapy runspider <this_file>.py
# After it finishes, a folder per GIF category is created under C:\GIF
# and the downloaded GIF images are stored inside it.