This class is built on the basic urllib2 module: it fetches the HTML document at a given URL, and the internal method that obtains a proxy can be overridden.
The code is as follows:
Class Proxyscrapy (object):
def __init__ (self):
Self.proxy_robot = Proxyrobot ()
Self.current_proxy = None
Self.cookie = Cookielib. Cookiejar ()
def __builder_proxy_cookie_opener (self):
Cookie_handler = Urllib2. Httpcookieprocessor (Self.cookie)
handlers = [Cookie_handler]
If proxy_enable:
Self.current_proxy = Ip_port = Self.proxy_robot.get_random_proxy ()
Proxy_handler = Urllib2. Proxyhandler ({' http ': ip_port[7:]})
Handlers.append (Proxy_handler)
Opener = Urllib2.build_opener (*handlers)
Urllib2.install_opener (opener)
Return opener
def get_html_body (Self,url):
Opener = Self.__builder_proxy_cookie_opener ()
Request=urllib2. Request (URL)
#request. Add_header ("accept-encoding", "GZIP,DEFLATE,SDCH")
#request. Add_header ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
#request. Add_header ("Cache-control", "No-cache")
#request. Add_header ("Connection", "keep-alive")
Try
Response = Opener.open (request,timeout=2)
Http_code = Response.getcode ()
If Http_code = 200:
If proxy_enable:
Self.proxy_robot.handle_success_proxy (Self.current_proxy)
html = Response.read ()
return HTML
Else
If proxy_enable:
Self.proxy_robot.handle_double_proxy (Self.current_proxy)
return Self.get_html_body (URL)
Except Exception as Inst:
Print Inst,self.current_proxy
Self.proxy_robot.handle_double_proxy (Self.current_proxy)
return Self.get_html_body (URL)