We continue with the same site as in the previous post; this time we add pipeline.py.
items.py
from scrapy.item import Item, Field


class Website(Item):
    """Scrapy item holding the fields scraped for one directory entry."""

    # Link text of the entry.
    name = Field()
    # Free-text description that follows the link.
    description = Field()
    # Target of the entry's link.
    url = Field()
dmoz.py
from scrapy.spider import Spider
from scrapy.selector import Selector

from dirbot.items import Website


class DmozSpider(Spider):
    """Spider that scrapes the Python book and resource listings on dmoz.org."""

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Build one ``Website`` item per entry of the directory listing.

        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        # Each <li> under the "directory-url" list is one site entry.
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            # Keep only the text from "- " up to the carriage return, which
            # drops the surrounding whitespace and line breaks.
            item['description'] = site.xpath('text()').re(r'-\s[^\n]*\r')
            items.append(item)

        return items
Note that the extraction of description differs from the one in the previous post: here the surrounding spaces and line breaks are removed.
pipeline.py
from scrapy.exceptions import DropItem


class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in
    their description."""

    # Put all words in lowercase: they are matched against the lowercased
    # description, so an entry with capitals or padding would never match.
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        """Return ``item`` unless its description contains a filtered word.

        Args:
            item: the scraped item; its ``'description'`` value is converted
                to text and lowercased before matching.
            spider: the spider that produced the item (unused here).

        Raises:
            DropItem: if any entry of ``words_to_filter`` occurs in the
                lowercased description.
        """
        # NOTE: ``unicode`` is the Python 2 text type, matching the rest of
        # this (Python 2 era) example.
        # Lowercase once, outside the loop, instead of once per word.
        description = unicode(item['description']).lower()
        for word in self.words_to_filter:
            if word in description:
                raise DropItem("contains forbidden word: %s" % word)
        return item
Purpose: filter out items whose description contains 'politics' or 'religion'.
Python crawler framework Scrapy, learning note 5: filtering sensitive words using pipelines