> A fairly complete write-up of handling anti-crawler bans by routing requests through a proxy IP pool
class HttpProxyMiddleware(object):
    """Scrapy downloader middleware that routes requests through a proxy pool.

    The pool lives in a Redis hash (``PROXY_HASH``). Each hash field is an
    account key and each value is the ``repr`` of a dict that holds at least
    the proxy address (``IP_FIELD``) and a success counter (``TIMES_FIELD``).

    * ``process_request`` picks a random pool entry and attaches it to the
      request.
    * ``process_response`` bumps the success counter on HTTP 200, retries on
      403, and on any other status drops proxies that have fewer than
      ``MIN_TRUSTED_SUCCESSES`` recorded successes before retrying.
    * ``process_exception`` applies the same drop-and-retry rule to
      connection-level failures.

    NOTE(review): this block was reconstructed from a whitespace-mangled
    paste; the casing of the Redis hash name, the ``request.meta`` key and
    the dict field names below follows the paste and should be confirmed
    against the data actually stored in Redis.
    """

    # Connection-level failures treated as "this proxy is unusable".
    EXCEPTIONS_TO_CHANGE = (
        defer.TimeoutError,
        TimeoutError,
        ConnectionRefusedError,
        ConnectError,
        ConnectionLost,
        TCPTimedOutError,
        ConnectionDone,
    )

    # Placeholder URL: replace "your password" with the real credential.
    REDIS_URL = "redis://:your password@localhost:6379/0"
    PROXY_HASH = "Xila_hash"        # Redis hash holding the pool
    META_KEY = "Accounttext"        # request.meta slot remembering the pool key
    IP_FIELD = "IP"                 # proxy dict field: proxy URL
    TIMES_FIELD = "times"           # proxy dict field: successful-use counter
    MIN_TRUSTED_SUCCESSES = 10      # proxies below this are deleted on failure

    def __init__(self):
        """Open the Redis connection used by every hook.

        ``decode_responses=True`` makes Redis return ``str`` instead of
        ``bytes`` so stored values can be parsed directly.
        """
        # Local import so the name is unambiguous regardless of how the
        # surrounding module imported redis.
        from redis import Redis

        # The other methods read ``self.rds``; the pasted code assigned
        # ``self.redis`` here, which made every hook fail with AttributeError.
        self.rds = Redis.from_url(self.REDIS_URL, decode_responses=True)

    def _load_proxy(self, key):
        """Return the proxy dict stored under ``key``, or ``None`` if absent.

        ``None`` is returned both when ``key`` itself is ``None`` and when the
        entry no longer exists (e.g. another in-flight request deleted it).
        """
        import ast

        if key is None:
            return None
        raw = self.rds.hget(self.PROXY_HASH, key)
        if raw is None:
            return None
        # SECURITY: the original used eval() on the stored string, which
        # executes arbitrary code if the Redis data is ever tampered with.
        # literal_eval only accepts Python literals, which is all a stored
        # dict repr needs.
        return ast.literal_eval(raw)

    def _drop_if_untrusted(self, key):
        """Delete the pool entry for ``key`` unless it has proven itself."""
        proxy = self._load_proxy(key)
        if proxy is not None and proxy[self.TIMES_FIELD] < self.MIN_TRUSTED_SUCCESSES:
            self.rds.hdel(self.PROXY_HASH, key)

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy from the pool to ``request``.

        Stores the proxy URL in ``request.meta["proxy"]`` and the pool key in
        ``request.meta[META_KEY]`` so the response/exception hooks can find
        the entry again. Returns ``None`` so Scrapy continues processing.

        Raises:
            IndexError: if the pool contains no usable entry. (Same exception
                type ``random.choice`` raised on an empty pool, but with an
                explicit message.)
        """
        keys = list(self.rds.hkeys(self.PROXY_HASH))
        # Uniformly random order; the first entry that still exists wins.
        # Iterating (instead of a single random.choice) tolerates entries
        # deleted between HKEYS and HGET.
        random.shuffle(keys)
        for key in keys:
            proxy = self._load_proxy(key)
            if proxy is not None:
                break
        else:
            raise IndexError(
                "proxy pool hash {!r} has no usable entries".format(self.PROXY_HASH)
            )

        logger.warning(
            "-----------------" + str(proxy) + " In the trial------------------------"
        )
        request.meta["proxy"] = proxy[self.IP_FIELD]
        request.meta[self.META_KEY] = key
        return None

    def process_response(self, request, response, spider):
        """Update proxy bookkeeping from the HTTP status and decide on retry.

        * 200: increment the proxy's success counter and pass the response on.
        * 403: assumed to be caused by the User-Agent rather than the proxy;
          the request is re-issued unchanged.
        * anything else: the proxy is assumed bad; it is deleted if it has
          fewer than ``MIN_TRUSTED_SUCCESSES`` successes, then the request is
          re-issued.

        Returns the response (200) or a copy of the request with
        ``dont_filter=True`` so the duplicate filter does not drop the retry.

        NOTE(review): non-200 responses are retried with no attempt limit, so
        a URL that legitimately answers e.g. 404 will loop and burn proxies.
        """
        http_status = response.status
        key = request.meta.get(self.META_KEY)

        if http_status == 200:
            proxy = self._load_proxy(key)
            # The entry may be gone if a concurrent failure removed it.
            if proxy is not None:
                proxy[self.TIMES_FIELD] = proxy[self.TIMES_FIELD] + 1
                # redis-py >= 3.0 rejects dict values; store the repr, which
                # is exactly what _load_proxy parses back.
                self.rds.hset(self.PROXY_HASH, key, str(proxy))
            return response

        if http_status == 403:
            logger.warning(
                "#########################403重新请求中############################"
            )
            return request.replace(dont_filter=True)

        ip = request.meta.get("proxy")
        self._drop_if_untrusted(key)
        logger.warning(
            "#################%sunavailable, deleted ########################", ip
        )
        return request.replace(dont_filter=True)

    def process_exception(self, request, exception, spider):
        """Retry through another proxy after a connection-level failure.

        Only acts when ``exception`` is one of ``EXCEPTIONS_TO_CHANGE`` and
        the request actually carried a proxy. In that case the proxy is
        deleted if it has fewer than ``MIN_TRUSTED_SUCCESSES`` successes and
        a copy of the request is returned for rescheduling. Otherwise returns
        ``None`` so other middlewares can handle the exception.
        """
        if not isinstance(exception, self.EXCEPTIONS_TO_CHANGE):
            return None
        if not request.meta.get("proxy", False):
            return None

        key = request.meta.get(self.META_KEY)
        logger.warning(
            "+++++++++++++++++++++++++{} is not available +++ will be deleted "
            "++++++++++++++++++++++++".format(key)
        )
        self._drop_if_untrusted(key)
        logger.debug(
            "Proxy {} link error {}".format(request.meta["proxy"], exception)
        )
        return request.replace(dont_filter=True)


# Backward-compatible alias for the spelling used in the original paste.
Httpproxymiddleware = HttpProxyMiddleware