Python Crawler Practice (1) -- Fetching Proxy IP Addresses in Real Time
Maintaining a proxy pool is very important when learning to write crawlers.
The code is described below:
1. Runtime environment: Python 3.x. Required libraries: requests and bs4 (the script parses pages with lxml, so lxml must also be installed).
2. Captures, in real time, the proxy IP addresses from the first three pages of the domestic high-anonymity proxy list (the page range can be changed as needed).
3. Verifies the captured proxies with multiple threads and saves the proxies that pass verification.
# -*- coding: utf-8 -*-
import re
import threading
import requests
import time
import urllib.request
from bs4 import BeautifulSoup as BS

rawProxyList = []
checkedProxyList = []
targets = []

headers = {
    'user-agent': r'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 '
                  r'(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'connection': 'Keep-alive'
}

# Build the list of target pages: the first three pages of the
# domestic high-anonymity proxy list.
for i in range(1, 4):
    target = r"http://www.xicidaili.com/nn/%d" % i
    targets.append(target)
# print(targets)


# Fetch the proxies
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print("Target website: " + self.target)
        r = requests.get(self.target, headers=headers)
        page = r.text
        soup = BS(page, "lxml")
        # class_ means "searching by CSS class"; see the BeautifulSoup docs for details.
        tr_list = soup.find_all("tr", class_="odd")
        for i in range(len(tr_list)):
            row = []
            # stripped_strings yields the row's strings with leading and
            # trailing whitespace removed.
            for text in tr_list[i].stripped_strings:
                row.append(text)
            # row = ['58.208.16.141', '123', 'Jiangsu Suzhou', 'high anonymity', 'HTTP', ...]
            ip = row[0]
            port = row[1]
            agent = row[4].lower()
            addr = agent + "://" + ip + ":" + port
            proxy = [ip, port, agent, addr]
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()


# Verify the proxies
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 2
        self.testUrl = "https://www.baidu.com/"

    def checkProxy(self):
        for proxy in self.proxyList:
            proxies = {}
            if proxy[2] == "http":
                proxies['http'] = proxy[3]
            else:
                proxies['https'] = proxy[3]
            t1 = time.time()
            try:
                r = requests.get(self.testUrl, headers=headers,
                                 proxies=proxies, timeout=self.timeout)
                time_used = time.time() - t1
                if r:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2],
                                             proxy[3], time_used))
                else:
                    continue
            except Exception:
                continue

    def run(self):
        self.checkProxy()


if __name__ == "__main__":
    getThreads = []
    checkedThreads = []

    # Start one thread per target page to fetch proxies.
    for i in range(len(targets)):
        t = ProxyGet(targets[i])
        getThreads.append(t)

    for i in range(len(getThreads)):
        getThreads[i].start()

    for i in range(len(getThreads)):
        getThreads[i].join()

    print('.' * 10 + "Captured %s proxies in total" % len(rawProxyList) + '.' * 10)

    # Start 10 threads for verification: split the captured proxies into
    # 10 slices and let each thread verify one slice.
    for i in range(10):
        n = len(rawProxyList) / 10
        # print(str(int(n * i)) + ":" + str(int(n * (i + 1))))
        t = ProxyCheck(rawProxyList[int(n * i):int(n * (i + 1))])
        checkedThreads.append(t)

    for i in range(len(checkedThreads)):
        checkedThreads[i].start()

    for i in range(len(checkedThreads)):
        checkedThreads[i].join()

    print('.' * 10 + "A total of %s proxies passed the verification" % len(checkedProxyList) + '.' * 10)

    # Persist the verified proxies.
    f = open("proxy_list.txt", 'w+')
    for checked_proxy in sorted(checkedProxyList):
        print("checked proxy is: %s\t%s" % (checked_proxy[3], checked_proxy[4]))
        f.write("%s:%s\t%s\t%s\t%s\n" % (checked_proxy[0], checked_proxy[1],
                                         checked_proxy[2], checked_proxy[3],
                                         checked_proxy[4]))
    f.close()
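For reference, below is a minimal sketch (not part of the original script) of how another crawler might consume the saved proxy_list.txt. It assumes the field layout produced by the f.write() call above ("ip:port", protocol, proxy URL and response time separated by tabs); the test endpoint http://httpbin.org/ip is only an illustrative choice.

# Minimal sketch: reuse the proxies stored in proxy_list.txt.
# Assumes each line is "ip:port<TAB>protocol<TAB>proxy_url<TAB>seconds".
import requests

def load_proxies(path="proxy_list.txt"):
    proxies = []
    with open(path) as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                protocol, addr = parts[1], parts[2]
                proxies.append({protocol: addr})
    return proxies

if __name__ == "__main__":
    for proxy in load_proxies():
        try:
            # httpbin.org/ip echoes the caller's IP, which makes it easy to
            # confirm that the request really went through the proxy.
            r = requests.get("http://httpbin.org/ip", proxies=proxy, timeout=2)
            print(proxy, r.status_code, r.text.strip())
        except requests.RequestException:
            print(proxy, "failed")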