This article illustrates how to crawl a site's sitemap information using Python and the Scrapy framework. It is shared for your reference; the details are as follows:
import re

from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.utils.response import body_or_str
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
class SitemapSpider(BaseSpider):
    """Spider that reads a sitemap.xml, extracts every <loc> URL, and
    crawls each page it finds.

    The sitemap is parsed with a plain regular expression rather than an
    XML parser, which is sufficient for well-formed sitemaps where every
    URL sits inside a <loc>...</loc> element.
    """

    name = "sitemapspider"
    start_urls = ["http://www.domain.com/sitemap.xml"]

    def parse(self, response):
        """Parse the sitemap response and yield one Request per <loc> URL."""
        nodename = 'loc'
        text = body_or_str(response)
        # Match "<loc ...>" or "<loc>" followed by the URL, then "</loc>".
        # re.DOTALL lets the URL span line breaks in a pretty-printed sitemap.
        r = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (nodename, nodename),
                       re.DOTALL)
        for match in r.finditer(text):
            url = match.group(2)
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        """Parse an individual page reached from the sitemap."""
        hxs = HtmlXPathSelector(response)
        # Mock item -- replace Item() with your project's item class
        # (Item is not defined in this snippet).
        blah = Item()
        # Do all your page parsing here, selecting the elements you want.
        blah.divtext = hxs.select('//div/text()').extract()[0]
        yield blah
I hope this article is helpful to you in your Python programming.