1、http://www.xicidaili.com/wt 國內免費代理網站
2、利用scrapy爬取該網站內的IP地址與連接埠,寫入txt文檔
3、編寫指令碼測試txt文檔中的ip地址與連接埠是否可用
4、將可用ip地址與連接埠輸入txt文檔
————————————————————————
1、編寫Item類
由於我們只需要ip地址與連接埠,所以只需寫一個屬性即可
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class IpItem(scrapy.Item):
    """Empty item left over from the scrapy project template; unused."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class IpInfoItem(scrapy.Item):
    """Holds a single proxy endpoint as one "ip:port" string.

    Only one field is needed because the spider concatenates the address
    and port before storing them.
    """
    ip = scrapy.Field()
2、編寫spider
# -*- coding: utf-8 -*-
import scrapy
import sys
sys.path.append("D:\\pycodes\\ip")
from ip.items import IpInfoItem


class IpSpider(scrapy.Spider):
    """Crawl the first five listing pages of xicidaili.com/wt and yield
    one IpInfoItem per table row, with the 'ip' field set to "ip:port".
    """
    name = 'Ip'
    # 爬取5頁網站的IP — the first five paginated listing pages.
    start_urls = ['http://www.xicidaili.com/wt/' + str(i) for i in range(1, 6)]

    def parse(self, response):
        """Extract address (td[2]) and port (td[3]) from every table row.

        Yields: IpInfoItem with item['ip'] == "<address>:<port>".
        """
        for sel in response.xpath('//tr'):
            ip = sel.xpath('.//td[2]/text()').extract_first()
            port = sel.xpath('.//td[3]/text()').extract_first()
            # The table's header row uses <th> cells, so both lookups
            # return None there; the original emitted a bogus "None:None"
            # entry for it. Skip rows with either field missing.
            if ip is None or port is None:
                continue
            # Build a fresh item per row: the original reused a single
            # Item instance across all yields, so any consumer holding
            # references would see every item mutated to the last row.
            item = IpInfoItem()
            item['ip'] = ip + ":" + port
            yield item
3、編寫pipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class IpPipeline(object):
    """Default template pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        return item


class IpInfoPipeline(object):
    """Appends each scraped "ip:port" value to xinresult.txt."""

    def process_item(self, item, spider):
        """Persist item['ip'] (one line per item) and pass the item on.

        Items without an 'ip' field are passed through untouched.
        """
        # 我們只需要IP地址與連接埠,因此只把字典值寫進txt檔案
        try:
            content = item['ip']
        # Narrow except: the original bare `except: pass` also hid real
        # bugs (and the leaked file handle below).
        except KeyError:
            return item
        # `with` guarantees the handle is closed; the original called
        # open(...).write(...) and never closed the file.
        with open("xinresult.txt", "a") as f:
            f.write(content + "\n")
        return item
至此,我們從網站上爬下來了5頁的IP,需要編寫指令碼進行測試
import requests

# Proxies that answered successfully. Kept module-global so the original
# test_alive() -> out_file(alive_ip) call pattern still works.
alive_ip = []


def test_alive(proxy):
    """Probe each "ip:port" string in *proxy* with a 2-second request to
    baidu.com and append the working ones to the global ``alive_ip``.
    """
    global alive_ip
    for proxies_be in proxy:
        # requests expects the proxy as a {"scheme": "host:port"} mapping.
        proxies = {"http": proxies_be}
        print("正在測試:{}".format(proxies))
        try:
            r = requests.get("http://www.baidu.com", proxies=proxies, timeout=2)
        # Catch only network/HTTP errors; the original bare `except:`
        # also swallowed KeyboardInterrupt and programming errors.
        except requests.RequestException:
            print("失敗")
            continue
        if r.status_code == 200:
            print("成功,ip為{}".format(proxies))
            alive_ip.append(proxies_be)
        else:
            print("失敗")


def out_file(alive_ip=()):
    """Write one proxy per line to alive_ip.txt.

    The default is an empty (immutable) tuple — the original used a
    mutable default list, a classic Python pitfall.
    """
    with open("alive_ip.txt", "w") as f:
        for ip in alive_ip:
            f.write(str(ip) + "\n")
    print("輸出完畢")


def test(filename="blank.txt"):
    """Read candidate proxies (one "ip:port" per line) from *filename*,
    test them all, and dump the live ones to alive_ip.txt.
    """
    with open(filename, "r") as f:
        lines = f.readlines()
    proxys = [line.strip() for line in lines]
    test_alive(proxys)
    out_file(alive_ip)


# Guarded so importing this module no longer fires network requests as a
# side effect; running it as a script behaves exactly as before.
if __name__ == "__main__":
    test("xinresult.txt")