This article describes a custom Scrapy spider middleware, written in Python, that avoids re-crawling item pages that have already been visited. It is shared here for your reference; the details are as follows:
from scrapy import log
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint

from myproject.items import MyItem


class IgnoreVisitedItems(object):
    """Middleware to ignore re-visiting item pages if they were already
    visited before. The requests to be filtered have a meta['filter_visited']
    flag enabled, and can optionally define an id to use for identifying
    them, which defaults to the request fingerprint -- although you'd want
    to use the item id, if you already have it beforehand, to make it more
    robust.
    """
    FILTER_VISITED = 'filter_visited'
    VISITED_ID = 'visited_id'
    CONTEXT_KEY = 'visited_ids'

    def process_spider_output(self, response, result, spider):
        # Keep the set of already-visited ids in a dict on the spider context.
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg("Ignoring already visited: %s" % x.url,
                                level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                # Record the id of the page this item was scraped from.
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                # Replace the filtered request with a stub item marked 'old'.
                ret.append(MyItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret

    def _visited_id(self, request):
        # Use the explicit id from meta if given, else the request fingerprint.
        return request.meta.get(self.VISITED_ID) or request_fingerprint(request)
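The article does not show how to wire the middleware up, so here is a minimal usage sketch. It assumes the class above is saved as myproject/middlewares.py, that MyItem is declared in myproject/items.py with the two fields the middleware writes, and that the spider name, URLs, XPath expression, and the priority value 543 are placeholders; response.urljoin also assumes Scrapy 1.0 or later. (Note that on modern Scrapy you would replace the deprecated scrapy.log and BaseItem above with spider.logger and scrapy.Item.)

# settings.py -- enable the spider middleware (module path and priority
# are assumptions; adjust them to your project layout)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.IgnoreVisitedItems': 543,
}

# items.py -- MyItem must declare the two fields the middleware writes
from scrapy.item import Item, Field

class MyItem(Item):
    visit_id = Field()
    visit_status = Field()
    # ... your real item fields go here

# spiders/example.py -- hypothetical spider opting its requests in
from scrapy import Spider
from scrapy.http import Request
from myproject.items import MyItem

class ExampleSpider(Spider):
    name = 'example'
    start_urls = ['http://example.com/products']

    def parse(self, response):
        for href in response.xpath('//a[@class="product"]/@href').extract():
            # The meta flag opts this request in to the middleware's filter;
            # setting meta['visited_id'] to a stable item id would make the
            # de-duplication more robust than the default fingerprint.
            yield Request(response.urljoin(href), callback=self.parse_item,
                          meta={'filter_visited': True})

    def parse_item(self, response):
        # The middleware stamps visit_id and visit_status onto this item.
        yield MyItem()

With this in place, the second and later encounters of the same page produce a MyItem with visit_status set to 'old' instead of a new request, which downstream pipelines can use to skip or update existing records.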
Hopefully this article will help you with Python programming.