# Imports as found in the older Scrapy releases this class comes from
# (the basestring check and set_crawler() date it to pre-1.0 Scrapy).
import copy

from scrapy.http import Request, HtmlResponse
from scrapy.spider import Spider
from scrapy.utils.spider import iterate_spider_output


class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    # parse() is called first, to process the response objects downloaded for
    # start_urls. It hands each response to _parse_response(), with the
    # callback set to parse_start_url() and the follow flag set to True.
    # parse() therefore returns both items and the follow-up Request objects.
    def parse(self, response):
        return self._parse_response(response, self.parse_start_url,
                                    cb_kwargs={}, follow=True)

    # Processes the responses returned for start_urls; override as needed.
    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    # Extracts the links that match any user-defined Rule from the response
    # and yields a Request object for each of them.
    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        # Extract all links; a link is kept as soon as any rule matches it.
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response)
                     if l not in seen]
            # Run the user-supplied process_links hook on the extracted links.
            if links and rule.process_links:
                links = rule.process_links(links)
            # Add each link to the seen set, build a Request for it, and set
            # the callback to _response_downloaded().
            for link in links:
                seen.add(link)
                # Build the Request; the index of the matching rule is stored
                # in meta so the rule's own callback can be recovered later.
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)
                # Call process_request() on every request. By default it is
                # the identity function, i.e. the request is returned without
                # any processing.
                yield rule.process_request(r)

    # Handles a response downloaded for a link extracted by a rule and
    # returns the resulting items and requests.
    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback,
                                    rule.cb_kwargs, rule.follow)

    # Parses the response object with the given callback and yields the
    # resulting Request or item objects.
    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        # First check whether a callback is set (it may be the parsing
        # function of a Rule, or parse_start_url()). If it is, let it process
        # the response, then hand the result to process_results(), which
        # returns the list cb_res.
        if callback:
            # When called from parse() this yields follow-up requests;
            # when called as a rule callback it yields items.
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        # If following is enabled, yield each follow-up request.
        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)

        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)

    def set_crawler(self, crawler):
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
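To make the flow above concrete, here is a minimal sketch of a CrawlSpider subclass written against a recent Scrapy release (the annotated source above is from an older version, where CrawlSpider and LinkExtractor live under scrapy.contrib instead of scrapy.spiders / scrapy.linkextractors). The domain, URL patterns, and the filter_links() helper are made up for illustration.

    # A minimal sketch, not a definitive implementation; example.com, the
    # /category/ and /item/ patterns, and filter_links() are assumptions.
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor


    class ExampleSpider(CrawlSpider):
        name = 'example'
        allowed_domains = ['example.com']
        start_urls = ['http://example.com/']

        # Each Rule pairs a LinkExtractor with optional callback /
        # process_links hooks; _compile_rules() later resolves the string
        # names to bound methods via get_method().
        rules = (
            # Category pages are only followed (no callback), so they only
            # feed _requests_to_follow().
            Rule(LinkExtractor(allow=r'/category/'), follow=True),
            # Item pages are parsed by parse_item(); follow defaults to
            # False once a callback is given.
            Rule(LinkExtractor(allow=r'/item/'),
                 callback='parse_item',
                 process_links='filter_links'),
        )

        def parse_start_url(self, response):
            # Called by parse() for the responses of start_urls themselves.
            return []

        def filter_links(self, links):
            # process_links hook referenced by name in the second Rule.
            return [l for l in links if 'logout' not in l.url]

        def parse_item(self, response):
            # rule.callback: runs inside _parse_response() after the link's
            # response arrives via _response_downloaded().
            yield {
                'url': response.url,
                'title': response.xpath('//title/text()').extract_first(),
            }

With these rules, the start_urls responses pass through parse() and parse_start_url(), category pages only generate follow-up requests through _requests_to_follow(), and item pages reach parse_item() through _response_downloaded(), exactly the chain traced in the source above.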