II. The Code
The code below is adapted from the original article (see it for a fuller explanation). Two changes were made: the IP-extraction step was revised, and the URL used to test whether a proxy is alive was changed from Baidu's homepage to a CSDN blog-article URL.
Join the group 548377875 to get dozens of sets of PDFs!
I also tried this against Douban — it returned a 403 almost immediately and I got nowhere. So if, after reading this article, you want to scrape other sites' data this way, think twice: if your account gets banned, I take no responsibility.
The code (Python 3.5):
# coding: utf-8
import random
import time
import urllib.error
import urllib.parse
import urllib.request
from multiprocessing import Pool  # multi-process worker pool
from lxml import etree  # HTML parsing (third-party)
def getuseragent():
    """Return a randomly chosen HTTP User-Agent string.

    Rotating the User-Agent across requests makes the crawler look like
    a mix of different browsers, which helps avoid trivial bot blocking.

    Returns:
        str: one entry from a fixed pool of real browser User-Agent values.
    """
    user_agents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    ]
    return random.choice(user_agents)
def getproxies():
    """Crawl the first five listing pages of xicidaili.com ("West Thorn"
    high-anonymity proxies) and build the raw, unverified proxy pool.

    Returns:
        list[str]: proxies as "ip:port" strings, in page order.
    """
    init_proxies = []
    # Crawl the first five pages.
    for i in range(1, 6):
        print("####")
        print("####爬取第" + str(i) + "页####")
        print("####")
        print("IP Address  Port  Survival time  Verification time")
        url = "http://www.xicidaili.com/nn/" + str(i)
        user_agent = getuseragent()
        headers = ("User-Agent", user_agent)
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        try:
            data = opener.open(url, timeout=5).read()
        except Exception as er:
            print("An error occurred while crawling, as follows:")
            print(er)
            # Skip this page — the original fell through and would parse
            # stale (or undefined) data from a previous iteration.
            continue
        selector = etree.HTML(data)
        # The original code used //tr[@class='odd'] and therefore only
        # extracted every other row; select all rows instead.
        ip_addrs = selector.xpath('//tr/td[2]/text()')  # IP address
        ports = selector.xpath('//tr/td[3]/text()')     # port
        sur_times = selector.xpath('//tr/td[9]/text()') # survival time
        ver_times = selector.xpath('//tr/td[10]/text()')# verification time
        for j in range(len(ip_addrs)):
            ip = ip_addrs[j] + ":" + ports[j]
            init_proxies.append(ip)
            # Echo the scraped row for progress feedback.
            print(ip_addrs[j] + " " + ports[j] + " " + sur_times[j] + " " + ver_times[j])
    return init_proxies
def testproxy(curr_ip):
    """Verify a single proxy by fetching a target page through it.

    @curr_ip: the "ip:port" proxy string currently being verified.

    Returns:
        list[str]: [curr_ip] when the fetch through the proxy returned a
        non-empty body, otherwise an empty list. The list form keeps the
        results of Pool.map uniform for the caller.
    """
    tmp_proxies = []
    # socket.setdefaulttimeout(...)  # (global timeout alternative)
    # tarurl = "http://www.baidu.com/"
    # tarurl = "http://blog.csdn.net/deserts_x/article/details/76726885"  # CSDN - old tree painting
    tarurl = "http://blog.csdn.net/deserts_x/article/details/76409116"  # CSDN - shu
    user_agent = getuseragent()
    proxy_support = urllib.request.ProxyHandler({"http": curr_ip})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [("User-Agent", user_agent)]
    # NOTE: install_opener mutates global urllib state; acceptable here
    # because each Pool worker runs in its own process.
    urllib.request.install_opener(opener)
    try:
        res = urllib.request.urlopen(tarurl, timeout=5).read()
        if len(res) != 0:
            tmp_proxies.append(curr_ip)
    except urllib.error.URLError as er2:
        if hasattr(er2, "code"):
            print("Authentication proxy IP (" + curr_ip + ") error occurred (error code):" + str(er2.code))
        if hasattr(er2, "reason"):
            print("Authentication proxy IP (" + curr_ip + ") error occurred (error reason):" + str(er2.reason))
    except Exception as er:
        print("Verifying proxy IP (" + curr_ip + ") the following error occurred:")
        print(er)
    # Throttle between checks to stay polite to the target site.
    time.sleep(2)
    return tmp_proxies
def multestproxies(init_proxies):
    """Verify proxy validity with a pool of worker processes.

    @init_proxies: the raw, unverified proxy pool ("ip:port" strings).

    Returns:
        list[list[str]]: one entry per input proxy — [proxy] when valid,
        [] when it failed verification (the shape testproxy returns).
    """
    pool = Pool(processes=7)
    fl_proxies = pool.map(testproxy, init_proxies)
    pool.close()
    pool.join()  # wait for every worker in the pool to finish
    return fl_proxies
if __name__ == '__main__':
    # --- (1) Build the raw proxy pool by crawling the listing site.
    init_proxies = getproxies()
    # --- (2) Verify every raw proxy in parallel worker processes.
    tmp_proxies = multestproxies(init_proxies)
    # Keep only the proxies that passed verification (testproxy returns
    # [] for failures, [ip] for successes).
    proxy_addrs = []
    for tmp_proxy in tmp_proxies:
        if len(tmp_proxy) != 0:
            # print(tmp_proxy)
            proxy_addrs.append(tmp_proxy)
    print(len(proxy_addrs))
    print(proxy_addrs)
III. Summary
There are plenty of articles and books online about crawling and maintaining a proxy-IP pool, and it is essential knowledge for anyone who wants to get into web scraping: when you crawl a site's data at scale, hammering it with a large number of requests per unit time from your own single IP will almost always get you blocked. To keep crawling, you need a large pool of proxy IPs to rotate through.
A slick Python trick: use Python to crawl proxy IPs — and quietly boost an article's view count!