# File: duplication.py
#
# Custom duplicate-request filter for Scrapy.
#
# Scrapy already defines the required structure for us in
# scrapy.dupefilters.BaseDupeFilter:
#
#     class BaseDupeFilter(object):
#         @classmethod
#         def from_settings(cls, settings):
#             return cls()
#         def request_seen(self, request):
#             return False
#         def open(self):        # can return a Deferred
#             pass
#         def close(self, reason):  # can return a Deferred
#             pass
#         def log(self, request, spider):  # log that a request was filtered
#             pass
#
# We only need to provide our own class with the same interface.
# Note the @classmethod from_settings returning cls(): this pattern is
# very common in Scrapy — Scrapy calls it automatically to build the
# instance, so we never instantiate the filter ourselves.


class DupeFilter(object):
    """Deduplicate requests by remembering every URL already seen."""

    def __init__(self):
        # Set of URLs that have already been scheduled for crawling.
        self.urls = set()

    @classmethod
    def from_settings(cls, settings):
        # Called automatically by Scrapy to construct the filter.
        return cls()

    def request_seen(self, request):
        """Return True if request.url was already crawled (so Scrapy
        skips it); otherwise remember the URL and return False."""
        if request.url in self.urls:
            # Already in the set: report it as seen, do not crawl again.
            return True
        # First time we meet this URL: record it and let it through.
        self.urls.add(request.url)
        return False

    def open(self):
        # Called when the spider starts; may return a Deferred.
        pass

    def close(self, reason):
        # Called when the spider finishes; may return a Deferred.
        pass

    def log(self, request, spider):
        # Log that a request has been filtered out.
        pass
Main program (the spider file):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class GetChoutiSpider(scrapy.Spider):
    """Spider that crawls dig.chouti.com and follows every pagination link.

    Deduplication is intentionally NOT done by hand here.  A set such as
    ``md5_urls = set()`` cannot be defined inside parse(), because parse()
    is invoked repeatedly during recursion; instead we delegate the work
    to the DupeFilter class configured via DUPEFILTER_CLASS in settings.
    """

    name = 'get_chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        # Printing response.url shows that, thanks to the dupefilter,
        # each pagination page is visited exactly once, e.g.:
        #   https://dig.chouti.com/
        #   https://dig.chouti.com/all/hot/recent/1
        #   https://dig.chouti.com/all/hot/recent/2
        #   ... up through /all/hot/recent/120 (order is not sequential).
        print(response.url)

        # Extract every pager link and recurse into it; duplicate URLs
        # are filtered out by the configured DUPEFILTER_CLASS, so this
        # recursion terminates once all pages have been seen.
        hrefs = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for href in hrefs:
            # Pager hrefs are site-relative — prepend the scheme/host.
            url = "https://dig.chouti.com%s" % href
            yield Request(url=url, callback=self.parse)
Configuration file (settings.py):
# settings.py
#
# DEPTH_LIMIT = 0 means "no depth limit": keep recursing through every
# pagination page instead of stopping at a fixed recursion depth.
DEPTH_LIMIT = 0

# To make Scrapy use our custom class for duplicate filtering, the
# dotted path to it must be declared here.  This replaces the built-in
# default filter with the DupeFilter defined in chouti/duplication.py.
DUPEFILTER_CLASS = 'chouti.duplication.DupeFilter'
(5). Deduplicating URLs: decoupling the crawling logic from the duplicate filtering.