A basic application of the urllib2 module: it fetches the HTML document at a given URL, and internally it can route each request through a proxy obtained from a proxy pool.
The code is as follows:
class ProxyScrapy(object):
    """Fetch HTML documents with urllib2, keeping cookies across requests
    and (optionally) routing each request through a random HTTP proxy.

    Depends on names defined elsewhere in the project:
      - ProxyRobot: manages the proxy pool (get_random_proxy,
        handle_success_proxy, handle_double_proxy) -- TODO confirm exact API.
      - proxy_enable: module-level flag that toggles proxy usage.
    """

    def __init__(self):
        self.proxy_robot = ProxyRobot()
        # Last proxy handed out by the pool; reported back on success/failure.
        self.current_proxy = None
        # Shared cookie jar so cookies persist across get_html_body() calls.
        self.cookie = cookielib.CookieJar()

    def __builder_proxy_cookie_opener(self):
        """Build (and globally install) an opener carrying the shared cookie
        jar and, when proxy_enable is set, a randomly chosen proxy.

        Returns:
            The urllib2 OpenerDirector that was built.
        """
        cookie_handler = urllib2.HTTPCookieProcessor(self.cookie)
        handlers = [cookie_handler]
        if proxy_enable:
            self.current_proxy = ip_port = self.proxy_robot.get_random_proxy()
            # ip_port presumably looks like "http://host:port"; [7:] strips
            # the "http://" scheme prefix -- TODO confirm pool format.
            proxy_handler = urllib2.ProxyHandler({'http': ip_port[7:]})
            handlers.append(proxy_handler)
        opener = urllib2.build_opener(*handlers)
        # NOTE: install_opener makes this opener the process-wide default,
        # affecting any other urllib2.urlopen() callers.
        urllib2.install_opener(opener)
        return opener

    def get_html_body(self, url):
        """Fetch *url* and return the response body as a string.

        On a non-200 status or any exception, the current proxy is reported
        as bad and the fetch is retried recursively with a fresh proxy.
        NOTE(review): there is no recursion depth limit, so a persistently
        failing URL can recurse until the stack is exhausted.
        """
        opener = self.__builder_proxy_cookie_opener()
        request = urllib2.Request(url)
        # request.add_header("Accept-Encoding", "gzip,deflate,sdch")
        # request.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        # request.add_header("Cache-Control", "no-cache")
        # request.add_header("Connection", "keep-alive")
        try:
            response = opener.open(request, timeout=2)
            http_code = response.getcode()
            if http_code == 200:
                if proxy_enable:
                    self.proxy_robot.handle_success_proxy(self.current_proxy)
                html = response.read()
                return html
            else:
                if proxy_enable:
                    self.proxy_robot.handle_double_proxy(self.current_proxy)
                return self.get_html_body(url)
        except Exception as inst:
            # Log the failure and retry through a different proxy.
            print('%s %s' % (inst, self.current_proxy))
            self.proxy_robot.handle_double_proxy(self.current_proxy)
            return self.get_html_body(url)