# Author: wuhao
# Crawl proxy IPs and save them to an Excel spreadsheet.
# Crawl today's proxy IPs and save them to Excel. Target site: xicidaili.com
# If requests are sent too quickly, the site will block the crawler's IP.
import urllib.request
import urllib.parse
import re
import xlwt
import http.cookiejar
import datetime
from bs4 import BeautifulSoup
import time


class GetProxyIp:
    """Fetch pages of free proxies from xicidaili.com and parse the proxy table."""

    def __init__(self, opener):
        # opener: a urllib opener (cookie-aware, browser-like headers) used for all requests
        self.opener = opener

    def get_html_page(self, url):
        """Download *url* with the shared opener and return the body decoded as UTF-8."""
        response = self.opener.open(url)
        try:
            return response.read().decode("utf-8")
        finally:
            # close the HTTP response explicitly instead of leaking the socket
            response.close()

    def clean_html(self, html):
        """Parse the proxy table (``<table id="ip_list">``) out of *html*.

        Returns six parallel lists:
        ``(ips, ports, server_addrs, anonymity, proxy_types, dates)``
        where *dates* holds only the "yy-mm-dd" part of the verification time.
        On a parse failure the lists collected so far are returned.
        """
        # NOTE: renamed from the original's ``type`` / ``time`` locals, which
        # shadowed the builtin and the imported ``time`` module.
        ips, ports, server_addrs = [], [], []
        anonymity, proxy_types, dates = [], [], []
        soup = BeautifulSoup(html, "html.parser")
        try:
            ip_table = soup.find("table", id="ip_list")
            for row in ip_table.find_all("tr")[1:]:  # first <tr> is the header row
                cells = row.find_all("td")
                ips.append(cells[1].string)
                ports.append(cells[2].string)
                # server address is usually wrapped in an <a>; fall back to raw text
                if cells[3].a is not None:
                    server_addrs.append(cells[3].a.string)
                else:
                    server_addrs.append(cells[3].string)
                anonymity.append(cells[4].string)
                proxy_types.append(cells[5].string)
                # e.g. "18-04-09 12:34" -> keep only the date part
                dates.append(cells[9].string.split(" ")[0])
        except (AttributeError, IndexError, TypeError) as exc:
            # page layout changed or the request was blocked; report the actual
            # error (the original printed the Exception *class*, not the instance)
            print("something wrong happened:", exc)
        return ips, ports, server_addrs, anonymity, proxy_types, dates


if __name__ == "__main__":
    # Today's date as "yy-mm-dd" (strip the century), matching the site's
    # verification-date column so rows can be filtered to today only.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d")[2:]

    # Build an opener with cookie support, disguised as a desktop browser.
    cookie = http.cookiejar.CookieJar()
    cookie_handle = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(cookie_handle)
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/57.0.2987.133 Safari/537.36",
    }
    opener.addheaders = list(header.items())

    # Page template for the "national anonymous" proxy listing.
    url = "http://www.xicidaili.com/nn/{num}/"
    gpi = GetProxyIp(opener)

    # One worksheet per day, named after the current date, with a header row.
    book = xlwt.Workbook()
    sheet = book.add_sheet(sheetname=current_time)
    for col, title in enumerate(
            ("IP Address", "Port", "Server Address", "Anonymous", "type", "Date")):
        sheet.write(0, col, title)

    _num = 1    # next page number to fetch
    index = 0   # number of data rows already written to the sheet
    is_over = True
    while is_over:
        # temp records the position of the first entry NOT from today
        # (-1 means every entry on the page is from today).
        temp = -1
        html = gpi.get_html_page(url.format(num=_num))
        result = gpi.clean_html(html)
        for k in range(len(result[5])):
            if result[5][k] != current_time:
                temp = k
                is_over = False
                break
        if temp == -1:
            # every entry on this page is from today: write all of them
            for i in range(len(result)):
                for j in range(len(result[i])):
                    print("written " + str(result[i][j]))
                    sheet.write(index + j + 1, i, result[i][j])
        else:
            # only the first *temp* entries are from today; this is the last page
            for k in range(len(result)):
                for kk in range(temp):
                    print("written " + str(result[k][kk]))
                    sheet.write(index + kk + 1, k, result[k][kk])
        _num += 1
        index += len(result[0])
        time.sleep(16)  # throttle: the site blocks crawlers that request too fast

    print("Write Complete")
    book.save("proxy.xls")
# Python crawler: crawl proxy IPs