E:\M\F1>cd ..
E:\m>scrapy startproject qsauto
New Scrapy project 'qsauto', using template directory 'd:\\users\\administrator\\appdata\\local\\programs\\python\\python36-32\\lib\\site-packages\\scrapy\\templates\\project', created in:
E:\m\qsauto
You can start your first spider with:
    cd qsauto
    scrapy genspider example example.com
E:\m>cd qsauto/
E:\m\qsauto>scrapy genspider -l
Available templates:
  basic
  crawl
  csvfeed
  xmlfeed
E:\m\qsauto>scrapy genspider -t crawl weisuen qiushibaike.com
weisuen.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

from qsauto.items import QsautoItem


class WeisuenSpider(CrawlSpider):
    """Crawl qiushibaike.com article pages and extract joke text and links.

    Follows links whose URL contains 'article' (see ``rules``) and parses
    each matched page with :meth:`parse_item`.
    """

    name = 'weisuen'
    allowed_domains = ['qiushibaike.com']
    # start_urls is intentionally commented out: start_requests() below
    # issues the first request itself so a custom User-Agent can be set.
    # start_urls = ['http://www.qiushibaike.com/']

    rules = (
        # Follow every link containing 'article' and hand the page to
        # parse_item; follow=True keeps the crawl going from those pages.
        Rule(LinkExtractor(allow='article'), callback='parse_item', follow=True),
    )

    def start_requests(self):
        """Yield the initial request with a browser-like User-Agent header."""
        ua = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0',
        }
        yield Request('http://www.qiushibaike.com/', headers=ua)

    def parse_item(self, response):
        """Extract joke content and its link from an article page.

        Returns a QsautoItem with:
          - content: list of text nodes under div.content
          - link: list of hrefs on a.contentHerf anchors
            (NOTE(review): 'contentHerf' looks like a site-side typo of
            'contentHref' but matches the site's actual class name — verify)
        """
        i = QsautoItem()
        i["content"] = response.xpath("//div[@class='content']/text()").extract()
        i["link"] = response.xpath("//a[@class='contentHerf']/@href").extract()
        print(i["content"])
        print(i["link"])
        print("")
        return i
Web crawling (PLUS10) Scrapy 4