This site is relatively simple, so the first example of crawler code is as follows:
# -*- coding: utf-8 -*-
"""
Created on June 12, 2017

Crawl dynamic proxy IP information from the domestic high-anonymity
proxy list site.

@see: http://www.xicidaili.com/nn/1
@author: Dzm
"""
import sys

# Python 2 only: force the default string encoding to UTF-8 so the
# Chinese text scraped from the site can be handled without explicit
# decoding everywhere. (reload/setdefaultencoding do not exist in Py3.)
reload(sys)
sys.setdefaultencoding('utf8')

import scrapy
from pyquery import PyQuery as pq

from eie.middlewares import udf_config
from eie.items import EieIpItem

logger = udf_config.logger


class IpXicidailiSpider(scrapy.Spider):
    """Spider for the xicidaili.com high-anonymity proxy listing."""

    name = "ip_xicidaili"
    allowed_domains = ["xicidaili.com"]
    start_urls = ["http://www.xicidaili.com/nn"]

    def parse(self, response):
        """Request page 1 explicitly, then fan out to the remaining pages.

        The last-but-one link of the pagination bar holds the total page
        count, which drives the fan-out.
        """
        logger.debug(response.url)
        # Request the first page. dont_filter is required because the
        # "/1" URL may be seen as a duplicate of the start URL.
        yield scrapy.Request(response.url + '/1',
                             callback=self.parse_item,
                             dont_filter=True)
        # Request the remaining pages.
        soup = pq(response.body)
        pagesum = soup('.pagination a:nth-last-child(2)').text()
        logger.debug('pagesum is %s', pagesum)
        if pagesum:
            for page in range(2, int(pagesum)):
                url = response.url + '/' + str(page)
                yield scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        """Extract proxy rows from the #ip_list table and yield items.

        Rows are skipped when the proxy's lifetime is measured in minutes
        or hours, or when its response speed is 3 seconds or more.
        """
        logger.debug('Now crawling URL: %s', response.url)
        soup = pq(response.body)
        trs = soup('#ip_list tr')
        if trs:
            # Row 0 is the header; the original code also skips row 1.
            for i in range(2, trs.length):
                tr = trs.eq(i)
                if tr:
                    # Filter out proxies slower than 3s and proxies whose
                    # lifetime is only hours or minutes.
                    life = tr('td:eq(8)').text()
                    if self.is_valid_time(life=life):
                        speed = tr('td:eq(6) > div').attr('title')
                        speed = self.filter_speed(speed)
                        if speed < 3:
                            # Proxies slower than 3s are considered too
                            # slow to be worth keeping.
                            item = EieIpItem()
                            item['ip'] = tr('td').eq(1).text()
                            item['port'] = tr('td').eq(2).text()
                            item['type'] = tr('td').eq(5).text()
                            item['life'] = self.filter_life(life)
                            item['speed'] = speed
                            yield item

    def filter_speed(self, speed):
        """Strip the unit suffix from a speed string and return a float.

        NOTE(review): the unit literal was garbled in the source; the
        site labels speed as e.g. "0.181秒" — confirm against the markup.
        """
        speed = speed.replace(u'秒', u'')
        return float(speed)

    def filter_life(self, life):
        """Strip the day unit from a lifetime string such as "5天".

        NOTE(review): unit literal reconstructed from garbled source —
        confirm against the site markup.
        """
        life = life.replace(u'天', u'')
        return life

    def is_valid_time(self, life):
        """Return False when the lifetime is given in minutes or hours.

        Such proxies are considered too short-lived to keep.
        NOTE(review): unit literals reconstructed from garbled source —
        confirm against the site markup.
        """
        if life.rfind(u'分钟') >= 0 or life.rfind(u'小时') >= 0:
            return False
        return True
Scrapy crawls asynchronously, so you can see from the following log that the crawled pages are not returned in order.