"""IP proxy pool.

Crawls the free-proxy listing pages on xicidaili.com, pushing every scraped
"ip:port" candidate onto the Redis list ``MyList``; then pops each candidate,
re-tests it as a live HTTP proxy, and stores working ones in the Redis list
``usable_ip``.
"""
from urllib.request import ProxyHandler, build_opener

import redis
from lxml import etree

# Site root, used to absolutize relative "next page" hrefs.
urlfront = "http://www.xicidaili.com"
# First listing page (high-anonymity proxies). Path is case-sensitive.
URL = "http://www.xicidaili.com/nn/1"

# Single Redis connection shared by the scraper (writes 'MyList') and the
# validator (reads 'MyList', writes 'usable_ip').
result = redis.Redis(host="127.0.0.1", port=6379, db=0)

# Browser-like header so the site serves the normal listing page.
_USER_AGENT = (
    "user-agent",
    "mozilla/5.0 (Windows; U Windows NT 6.1; En-us; rv:1.9.1.6) "
    "gecko/20091201 firefox/3.5.6",
)


def get_allcode(url):
    """Fetch *url* through a fixed bootstrap proxy and return the page HTML.

    The hard-coded proxy is only used to reach the listing site itself;
    scraped candidates are validated separately by get_allcode_ip().
    NOTE(review): the bootstrap proxy address is hard-coded and may be dead —
    confirm or replace before running.
    """
    proxy = {"https": "110.73.0.45:8123"}
    proxy_support = ProxyHandler(proxy)
    opener = build_opener(proxy_support)
    opener.addheaders = [_USER_AGENT]
    r = opener.open(url)
    html = r.read().decode("utf-8")
    return str(html)


def find_ip(s):
    """Extract ip/port pairs from listing HTML *s* and queue them in Redis.

    Table rows alternate between class="odd" and class=""; each <td> text
    cell is classified by shape: contains '.' -> IP address, all digits ->
    port, contains '-' (dates / misc columns) -> ignored. Pairs are joined
    as "ip:port" and pushed onto the Redis list 'MyList'.
    """
    selector = etree.HTML(s)
    links = selector.xpath(
        '//tr[@class = "odd"]/td/text() | //tr[@class = ""]/td/text()'
    )
    ip = []
    port = []
    for link in links:
        if "-" in link:
            pass  # date / misc columns — not part of an address
        elif link.isdigit():
            port.append(link)
        elif "." in link:
            ip.append(link)
    # Pair IPs with ports positionally (they appear in matching order).
    for i in range(len(ip)):
        ips = ip[i] + ":" + port[i]
        result.lpush("MyList", ips)


def get_next_page(s):
    """Return the absolute URL of the "next page" link in *s*, or None.

    Looks for <div class="pagination"><a class="next_page" href=...>; when
    no such link exists (last page) the xpath result is empty and None is
    returned.
    """
    selecter = etree.HTML(s)
    link = selecter.xpath(
        '//div[@class = "pagination"]/a[@class = "next_page"]/@href'
    )
    for i in link:
        if i is None:
            return None
        return urlfront + i
    return None  # no pagination link on this page


def get_allcode_ip(url, ip):
    """Try to fetch *url* through candidate proxy *ip* (bytes from Redis).

    On success the proxy is considered usable and pushed onto the Redis
    list 'usable_ip'; any failure (timeout, connection refused, bad
    response) is printed and the candidate is silently dropped — this is
    deliberate best-effort filtering, not an error path.
    """
    try:
        ip = str(ip, encoding="utf-8")  # Redis returns bytes; decode to str
        timeout = 5  # seconds — a proxy slower than this is unusable
        proxy = {"http": ip}
        proxy_support = ProxyHandler(proxy)
        opener = build_opener(proxy_support)
        opener.addheaders = [_USER_AGENT]
        r = opener.open(url, None, timeout)
        html = r.read().decode("utf-8")
        print("+++++++++++++++")
        # Proxy answered with a decodable page — record it as verified.
        result.lpush("usable_ip", ip)
        print(ip)
        print("+++++++++++++++")
    except Exception as err:
        print(err)


if __name__ == "__main__":
    # Phase 1: crawl listing pages until there is no "next page" link,
    # queueing every scraped candidate in Redis. find_ip() runs BEFORE the
    # next-page check so the last page is parsed too (the original broke
    # out of the loop before parsing it).
    url = URL
    while 1:
        print(url)
        s = get_allcode(url)
        find_ip(s)
        url = get_next_page(s)
        print(url)
        if url is None:
            break
    # Phase 2: pop each candidate and verify it against the start page.
    # Uses the constant URL, not the exhausted crawl variable (which is
    # None here — the original passed None as the test URL).
    while 1:
        ip = result.lpop("MyList")
        print(ip)
        if ip is None:
            break
        get_allcode_ip(URL, ip)
# PYTHON_DAY06 (IP proxy pool): crawl xicidaili.com free-proxy listings,
# verify each candidate proxy, and store the working ones in Redis.