This example describes a custom Scrapy spider middleware, written in Python, that avoids re-collecting item pages that have already been visited. It is shared here for your reference, as follows:
from scrapy import log
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint
from myproject.items import MyItem


class IgnoreVisitedItems(object):
    """Middleware to ignore re-visiting item pages if they were already
    visited before. The requests to be filtered have a
    meta['filter_visited'] flag enabled, and may optionally define an id
    for identifying them, which defaults to the request fingerprint,
    although you'd want to use the item id, if you already have it
    beforehand, to make it more robust.
    """
    FILTER_VISITED = 'filter_visited'
    VISITED_ID = 'visited_id'
    CONTEXT_KEY = 'visited_ids'

    def process_spider_output(self, response, result, spider):
        # Keep the set of already-visited ids in a dict on the spider's
        # context attribute
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                # Only filter requests that explicitly opted in via meta
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg("Ignoring already visited: %s" % x.url,
                                level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                # Record the request that produced this item as visited
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                # Emit a placeholder item instead of the filtered request
                ret.append(MyItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret

    def _visited_id(self, request):
        # An explicit visited_id in meta wins; otherwise fall back to the
        # request fingerprint
        return request.meta.get(self.VISITED_ID) or request_fingerprint(request)
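To put the middleware to work, it has to be activated in the project settings, and the spider has to set the filter_visited flag on the requests it wants de-duplicated. Below is a minimal sketch of both steps, matching the old-style Scrapy API used by the snippet above; the module path myproject.middlewares, the priority value 950, and the example spider and URLs are assumptions for illustration, not part of the original code:

# settings.py -- activate the spider middleware (the module path and the
# priority number are assumptions; adjust them to your project layout)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.IgnoreVisitedItems': 950,
}

# A hypothetical spider that opts its item-page requests into the filter
from scrapy.http import Request
from scrapy.spider import BaseSpider  # old-style Scrapy API, as in the snippet


class ExampleSpider(BaseSpider):
    name = 'example'
    start_urls = ['http://www.example.com/items']

    def parse(self, response):
        # Any request marked with filter_visited is dropped by
        # IgnoreVisitedItems once its fingerprint has been seen
        yield Request('http://www.example.com/items/1',
                      callback=self.parse_item,
                      meta={'filter_visited': True})

    def parse_item(self, response):
        pass  # extract and yield the item here

If the item id is already known when the request is built, putting it in meta['visited_id'] makes the de-duplication independent of URL variations, as the middleware's docstring suggests.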
I hope this article will help you with your Python programming.