This example describes how to use Python's Scrapy framework to crawl a website's sitemap information. It is shared here for your reference; the details are as follows:
import re

from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.response import body_or_str


class SitemapSpider(BaseSpider):
    """Spider that reads a sitemap.xml file, extracts every <loc> URL,
    and crawls each one with :meth:`parse_page`."""

    name = "SitemapSpider"
    # Replace with the real sitemap URL of the site you want to crawl.
    start_urls = ["http://www.domain.com/sitemap.xml"]

    def parse(self, response):
        """Parse the sitemap response and yield a Request per <loc> entry.

        The sitemap is scanned with a regex rather than an XML parser,
        capturing the text between each <loc ...> and </loc> tag pair.
        """
        nodename = 'loc'
        text = body_or_str(response)
        # Group 2 captures the URL between the opening and closing tags;
        # re.DOTALL lets '.' span newlines inside the tag body.
        r = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (nodename, nodename),
                       re.DOTALL)
        for match in r.finditer(text):
            url = match.group(2)
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        """Parse one page reached from the sitemap and yield an item.

        NOTE(review): ``Item()`` is a mock placeholder from the original
        article — substitute your own scrapy Item subclass.
        """
        hxs = HtmlXPathSelector(response)
        # Mock item
        blah = Item()
        # Do all your page parsing and select the elements you want.
        # (Original article had a typo here: 'blash' instead of 'blah'.)
        blah.divtext = hxs.select('//div/text()').extract()[0]
        yield blah
Hopefully this article will help you with Python programming.