Code:
# -*- coding: utf-8 -*-
"""Scrape free proxies from https://www.kuaidaili.com/free/ into a local file.

Reconstructed from a whitespace-mangled paste; behavior preserved where it
could be read.  Each listing page is fetched once, its table cells are pulled
out by regex, and the rows are appended to PROXY_FILE.
"""
import os
import random
import re
from time import sleep

# Destination for the harvested proxies (original hard-coded this Windows path).
PROXY_FILE = 'C:/users/adimin/desktop/proxyip.txt'

# One compiled pattern per table column.  The site tags each cell with a
# data-title attribute; (.+?)</td grabs the cell text non-greedily.
# NOTE(review): titles taken verbatim from the pasted code ("Type",
# "Location") — confirm against the live page, which may use localized titles.
_FIELD_PATTERNS = {
    'ip': re.compile(r'data-title="IP">(.+?)</td'),
    'port': re.compile(r'data-title="PORT">(.+?)</td'),
    'type': re.compile(r'data-title="Type">(.+?)</td'),
    'position': re.compile(r'data-title="Location">(.+?)</td'),
}


def extract_proxy_fields(html):
    """Parse one listing page's HTML into per-column lists.

    :param html: page source as a string.
    :return: dict with keys 'ip', 'port', 'type', 'position'; each value is
        a flat list of cell strings in page order.  (The original wrapped
        each list in another list and indexed ``[0]`` everywhere; the flat
        shape removes that.)
    """
    return {name: pattern.findall(html)
            for name, pattern in _FIELD_PATTERNS.items()}


class ProxyIPPool(object):
    """Fetches and parses one page of the free-proxy listing."""

    def __init__(self, page):
        # 1-based page number of the listing to fetch.
        self.page = page

    def init_proxy_ip_pool(self):
        """Download page ``self.page`` and return its parsed proxy fields.

        :return: dict as produced by :func:`extract_proxy_fields`.
        :raises requests.RequestException: on network failure or timeout.
        """
        # Lazy import keeps the pure parsing helper usable without the
        # third-party dependency installed.
        import requests

        url = 'https://www.kuaidaili.com/free/'
        if self.page > 1:
            # Page N of the listing lives at /free/inha/N/.
            url = url + 'inha/' + str(self.page) + '/'
        headers = {
            'Accept': ('text/html,application/xhtml+xml,application/xml;'
                       'q=0.9,image/webp,image/apng,*/*;q=0.8'),
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.kuaidaili.com',
            # 'Referer': url,  # enable if the site starts rejecting direct hits
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/64.0.3282.168 Safari/537.36'),
        }
        response = requests.Session().get(url, headers=headers, timeout=2)
        print(response.status_code)
        # The original ran BeautifulSoup and then regexed str(tags); the
        # regexes match the raw HTML just as well, so the bs4/lxml pass
        # (and its dependency) is dropped.
        fields = extract_proxy_fields(response.text)
        # Random pause so repeated page fetches don't hammer the site.
        sleep(random.random() * 7)
        return fields


def create_proxy_ip_pool(page):
    """Fetch one listing page and append its rows to PROXY_FILE.

    Prints each formatted row and mirrors it to the file.  On any file
    error the whole run is aborted with exit code 2 (original behavior,
    but the original caught FileExistsError, which mode 'a' can never
    raise; OSError is the error open/write actually produce).
    """
    pool = ProxyIPPool(page).init_proxy_ip_pool()
    print('Initialization complete! Starting to build the pool ...')
    rows = zip(pool['ip'], pool['port'], pool['type'], pool['position'])
    try:
        # Open once per page instead of once per row.
        with open(PROXY_FILE, 'a') as fp:
            for ip, port, proxy_type, position in rows:
                line = (format(ip, '<22') + format(port, '<17')
                        + format(proxy_type, '<12') + position)
                print(line)
                fp.write(line + '\r\n')
    except OSError as err:
        print(err)
        os._exit(2)


if __name__ == '__main__':
    print('Initializing proxy pool ... please be patient ...')
    header = (format('IP', '^16') + format('PORT', '^16')
              + format('type', '^16') + format('location', '^16'))
    print(header)
    # Mode 'a' creates the file if it does not exist, so the original's
    # duplicated try/except fallback to mode 'w' is unnecessary.
    with open(PROXY_FILE, 'a') as fp:
        fp.write(header + '\r\n')
    # Crawl every listing page.  A per-page loop here (rather than inside
    # init_proxy_ip_pool) keeps one fetch+write cycle per call.
    for i in range(1, 2177):
        create_proxy_ip_pool(i)
Operation Result:
Saved to local file:
Python learning: building an IP proxy pool