#!/usr/bin/env python
# coding=utf-8
# Dairu
# http://www.linuxyw.com

import re
import random
import sys
import time
import datetime
import threading
from random import choice

import requests
import bs4


def get_ip():
    """Fetch a list of proxy IPs from xicidaili.com."""
    url = "http://www.xicidaili.com/nn"
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip, deflate, sdch",
               "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
               "Referer": "http://www.xicidaili.com",
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"}
    r = requests.get(url, headers=headers)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    data = soup.table.find_all("td")
    ip_compile = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')   # match IP addresses
    port_compile = re.compile(r'<td>(\d+)</td>')                # match ports
    ip = re.findall(ip_compile, str(data))                      # all IPs
    port = re.findall(port_compile, str(data))                  # all ports
    # Combine IP and port, e.g. 115.112.88.23:8080
    return [":".join(i) for i in zip(ip, port)]


# User-Agent list; each request randomly picks one User-Agent from it
uas = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
    "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
]


def get_url(code=0, ips=[]):
    """Cast one vote.

    If the vote fails because the proxy IP is unavailable, the proxy is
    replaced automatically and the vote is retried.
    """
    try:
        ip = choice(ips)
    except:
        return False
    else:
        proxies = {
            "http": ip,
        }
        headers2 = {"Accept": "text/html,application/xhtml+xml,application/xml;",
                    "Accept-Encoding": "gzip, deflate, sdch",
                    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                    "Referer": "",
                    "User-Agent": choice(uas),
                    }
        try:
            num = random.uniform(0, 1)
            # Address of the polling site; the real domain name is not shown here
            hz_url = "http://www.xxxxx.com/xxxx%s" % num
            hz_r = requests.get(hz_url, headers=headers2, proxies=proxies)
        except requests.exceptions.ConnectionError:
            print "ConnectionError"
            if not ips:
                print "no proxy IPs left"
                sys.exit()
            # Remove the unavailable proxy IP
            if ip in ips:
                ips.remove(ip)
            # Retry the request with another proxy
            get_url(code, ips)
        else:
            date = datetime.datetime.now().strftime('%H:%M:%S')
            print u"Attempt %s [%s] [%s]: vote %s (available proxy IPs: %s)" % (code, date, ip, hz_r.text, len(ips))


ips = []
for i in xrange(6000):
    # Refresh the proxy pool every 1000 iterations; each refresh fetches the
    # latest 100 proxies from the list page
    if i % 1000 == 0:
        ips.extend(get_ip())
    # Spawn one voting thread per second; shorten the sleep to vote faster
    # (time.sleep accepts fractional seconds)
    t1 = threading.Thread(target=get_url, args=(i, ips))
    t1.start()
    time.sleep(1)
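As written, the script targets Python 2: it uses print statements, xrange, and the u"" string prefix. A minimal sketch of the Python 3 equivalents of those constructs, where the dummy values are assumptions used only for illustration:

# Python 3 equivalents of the Python 2 constructs used above:
# print statement -> print() function, xrange -> range, u"" prefix optional.
import datetime

code, ip, result, pool = 1, "115.112.88.23:8080", "ok", 100   # dummy values for illustration
date = datetime.datetime.now().strftime('%H:%M:%S')
print("Attempt %s [%s] [%s]: vote %s (available proxy IPs: %s)"
      % (code, date, ip, result, pool))

for i in range(3):   # range() replaces xrange()
    print(i)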
Source: Python crawler: auto-voting script that automatically crawls proxy IPs, https://www.linuxyw.com/806.html
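The script only drops a proxy after a vote request has already failed through it. A small pre-filter can remove dead proxies up front. The helper below is a sketch, not part of the original post: filter_alive(), the test URL http://httpbin.org/ip, and the 5-second timeout are all assumptions chosen for illustration; it reuses get_ip() from the script above.

import requests
from requests.exceptions import RequestException

def filter_alive(proxy_list, test_url="http://httpbin.org/ip", timeout=5):
    """Keep only the proxies that answer a simple GET within the timeout.

    Hypothetical helper, not part of the original script; test_url and
    timeout are arbitrary illustration values.
    """
    alive = []
    for proxy in proxy_list:
        try:
            requests.get(test_url, proxies={"http": proxy}, timeout=timeout)
        except RequestException:
            continue   # unreachable or too slow, skip it
        alive.append(proxy)
    return alive

# Possible usage with the script above: ips = filter_alive(get_ip())

Filtering costs up to one timeout per proxy, so it trades slower startup for fewer failed vote requests later.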