python 爬取可用

來源:互聯網
上載者:User

標籤:user   use   ip地址   parse   設定   ipc   col   http   random   

# coding: utf-8
"""Scrape free proxy addresses from xicidaili.com and record the usable ones.

The script downloads the first nine listing pages, splits the advertised
proxies into HTTP and HTTPS groups, probes every proxy in its own thread and
appends each proxy that answers within the timeout to a text file, one
``ip:port`` per line.
"""
from bs4 import BeautifulSoup
import time
import threading
import random
import requests
import socket

# telnetlib is not used below; the import is kept only for backward
# compatibility and is guarded because the module was removed from the
# standard library in Python 3.13.
try:
    import telnetlib
except ImportError:
    telnetlib = None

# Global socket timeout of 3 s: a raw socket operation that gets no response
# within 3 seconds is abandoned with a timeout error.
socket.setdefaulttimeout(3)

# Timeout (seconds) passed explicitly to every requests call.
REQUEST_TIMEOUT = 3

# Output files. Raw strings keep the backslash literal without relying on
# "\i" being an unrecognised escape sequence.
HTTP_OUTPUT_PATH = r"E:\ip_http.txt"
HTTPS_OUTPUT_PATH = r"E:\ip_https.txt"

headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",}

# Serialises appends to the result files across the worker threads.
_write_lock = threading.Lock()


def get_ip():
    """Collect proxy URLs from listing pages 1-9 of xicidaili.com.

    A page that cannot be downloaded is reported with ``print`` and skipped,
    and table rows with fewer than six cells are ignored, so one bad page or
    row no longer discards the rest of the scrape.

    Returns:
        A tuple ``(httpResult, httpsResult)`` of two lists holding
        ``'http://ip:port'`` and ``'https://ip:port'`` strings respectively.
    """
    httpResult = []
    httpsResult = []
    for page in range(1, 10):
        IPurl = 'http://www.xicidaili.com/nn/%s' % page
        try:
            rIP = requests.get(IPurl, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as inst:
            print(inst)
            continue
        soupIP = BeautifulSoup(rIP.text, 'html.parser')
        # The first <tr> is skipped, as in the original code (presumably the
        # table header row).
        for tr in soupIP.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if len(tds) < 6:
                continue
            ip = tds[1].text.strip()
            port = tds[2].text.strip()
            protocol = tds[5].text.strip()
            if protocol == 'HTTP':
                httpResult.append('http://' + ip + ':' + port)
            elif protocol == 'HTTPS':
                httpsResult.append('https://' + ip + ':' + port)
    return httpResult, httpsResult


def _check_proxy(ip, port, scheme, test_url, out_path):
    """Probe one proxy and append ``ip:port`` to *out_path* when it works.

    Any response received through the proxy counts as success; the HTTP
    status code is not inspected. Returns True on success, False otherwise.
    """
    address = "%s:%s" % (ip, port)
    print(address)
    try:
        # requests assumes the "http://" scheme for a bare "ip:port" proxy
        # value; it is spelled out here so the intent is explicit.
        requests.get(test_url, proxies={scheme: "http://" + address},
                     timeout=REQUEST_TIMEOUT)
    except (requests.RequestException, OSError):
        print('f')
        return False
    print('---------------------------success')
    # The file is opened only after a successful probe and closed right away,
    # so no handle is held (or leaked) while waiting on the network.
    with _write_lock:
        with open(out_path, "a") as f:
            f.write(address + '\n')
    return True


def cip(x, y):
    """Check an HTTP proxy (*x* = IP, *y* = port) against ip.chinaz.com.

    A working proxy is appended to ``E:\\ip_http.txt``. Returns True when the
    proxy answered, False otherwise.
    """
    return _check_proxy(x, y, 'http', 'http://ip.chinaz.com/getip.aspx',
                        HTTP_OUTPUT_PATH)


def csip(x, y):
    """Check an HTTPS proxy (*x* = IP, *y* = port) against www.lagou.com.

    A working proxy is appended to ``E:\\ip_https.txt``. Returns True when the
    proxy answered, False otherwise.
    """
    return _check_proxy(x, y, 'https', 'https://www.lagou.com/',
                        HTTPS_OUTPUT_PATH)


def _split_proxy_url(url):
    """Split ``'scheme://ip:port'`` into an ``(ip, port)`` tuple of strings."""
    host_port = url.split("://", 1)[-1]
    ip, _, port = host_port.rpartition(":")
    return ip.strip(), port.strip()


def _ensure_file(path):
    """Create *path* if it does not exist, leaving existing content intact.

    The original code called ``truncate()`` on a handle opened in append
    mode, which cuts the file at its end and therefore removes nothing; the
    only effect was creating the file, which is what is preserved here.
    """
    with open(path, "a"):
        pass


def _run_checks(proxy_urls, worker):
    """Run *worker(ip, port)* in one thread per proxy URL and wait for all."""
    threads = [
        threading.Thread(target=worker, args=_split_proxy_url(url))
        for url in proxy_urls
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


def main():
    """Scrape the proxy lists, then verify the HTTP and HTTPS proxies."""
    httpResult, httpsResult = get_ip()
    print(len(httpResult), len(httpsResult))

    _ensure_file(HTTP_OUTPUT_PATH)
    _run_checks(httpResult, cip)

    _ensure_file(HTTPS_OUTPUT_PATH)
    _run_checks(httpsResult, csip)


if __name__ == '__main__':
    main()

 

python 爬取可用

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.