[Python]第八課筆記 簡單代理爬蟲

來源:互聯網
上載者:User

環境:作業系統 Windows 7,Python 2.7

# coding=utf8
# NOTE: the coding declaration above only takes effect when it sits on the
# first or second line of the saved script.
#
# Simple proxy scraper: download a proxy listing page, extract
# (ip, port, type, location) rows, then keep only the proxies through which a
# test page can be fetched, sorted by response time, and save them to a file.
#
# Target interpreter is Python 2.7. The code sticks to the syntax subset shared
# with Python 3 (single-argument print(...), "except Exception") so the module
# can also be imported under Python 3.
import datetime
import os
import re
import threading
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the same opener/handler API in urllib.request.
    import urllib.request as urllib2

# One table row of the listing page. The port is not written in clear text:
# the page emits it through document.write(":"+a+b+c), where each letter is a
# JavaScript variable holding one digit (see the ``ports`` mapping).
_PROXY_ROW_RE = re.compile(
    r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')

_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 '
               '(KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1')


def _to_text(data, encoding='utf-8'):
    """Return *data* as the interpreter's native ``str`` type.

    Under Python 2 a response body already is ``str`` and is returned
    untouched. Under Python 3 it is ``bytes`` and gets decoded.
    NOTE(review): utf-8 is an assumption about the remote pages; undecodable
    bytes are replaced rather than raising. Confirm the real page encoding.
    """
    if isinstance(data, str):
        return data
    return data.decode(encoding, 'replace')


def _parse_proxy_rows(pageinfo, ports):
    """Extract ``[ip, port, agent, addr]`` entries from listing-page HTML."""
    rows = []
    for ip, port_code, agent, addr in _PROXY_ROW_RE.findall(pageinfo):
        # "z+m+a" -> "342": translate every letter through the digit table.
        port = ''.join(ports[letter] for letter in port_code.split('+'))
        rows.append([ip, port, agent, addr])
    return rows


def get_proxy_addr(urls, ports):
    """Download every listing page in *urls* and collect the proxies on it.

    Args:
        urls: iterable of page URLs to download.
        ports: dict mapping the page's obfuscation letters to digit strings.

    Returns:
        A list of ``[ip, port, agent, addr]`` lists, in page order.

    Raises:
        KeyError: a port expression uses a letter missing from *ports*.
        Any error raised by ``urlopen`` for an unreachable page propagates.
    """
    proxylist = []
    for url in urls:
        res = urllib2.urlopen(url)
        try:
            pageinfo = _to_text(res.read())
        finally:
            # Release the connection even when reading fails.
            res.close()
        proxylist.extend(_parse_proxy_rows(pageinfo, ports))
    print(u'資料分析完畢開始返回--------------------------------------------')
    return proxylist


class ProxyCheck(threading.Thread):
    """Check which scraped proxies are usable and how fast they respond.

    Running the thread tests every proxy, sorts the working ones by elapsed
    time and writes them to ``Proxy-<timestamp>.txt`` in the current working
    directory.
    """

    def __init__(self, proxylist):
        threading.Thread.__init__(self)
        self.proxylist = proxylist
        # Seconds to wait for the test page through each proxy.
        self.timeout = 10
        self.test_url = "http://www.baidu.com"
        # Marker that must appear in the fetched page for the proxy to count.
        self.test_str = '030173'
        # Working proxies as (ip, port, agent, addr, seconds) tuples.
        self.checkedPorxyList = []

    def checkPorxy(self):
        """Fetch ``test_url`` through each proxy and record the working ones."""
        # A single cookie processor is shared by all openers.
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxy_server = 'http://%s:%s' % (proxy[0], proxy[1])
            proxy_handler = urllib2.ProxyHandler({"http": proxy_server})
            opener = urllib2.build_opener(cookies, proxy_handler)
            opener.addheaders = [('User-Agent', _USER_AGENT)]

            started = time.time()
            try:
                # Use the opener directly: install_opener() would rebind the
                # process-wide default opener from inside a worker thread.
                req = opener.open(self.test_url, timeout=self.timeout)
                try:
                    result = _to_text(req.read())
                finally:
                    req.close()
            except Exception:
                # Deliberately broad: any failure means the proxy is unusable.
                # The message text is kept for output compatibility.
                print('%s timeout' % proxy[0])
                continue
            timeused = time.time() - started

            # A plain membership test; comparing find() against 1 would miss
            # a marker located at the very start of the page.
            if self.test_str in result:
                self.checkedPorxyList.append(
                    (proxy[0], proxy[1], proxy[2], proxy[3], timeused))
                print(u'成功採集 %s %s' % (proxy[0], timeused))

    def sort(self):
        """Order ``checkedPorxyList`` in place, fastest proxy first."""
        # list.sort() mutates the list; sorted() would return a new list and
        # leave this one untouched.
        self.checkedPorxyList.sort(key=lambda entry: entry[4])

    def save(self):
        """Write the checked proxies, one per line, to a timestamped file."""
        stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        filename = os.path.join(os.getcwd(), 'Proxy-%s.txt' % stamp)
        # The file is only written, never read back, so 'w' is sufficient.
        with open(filename, 'w') as f:
            for proxy in self.checkedPorxyList:
                f.write('%s %s %s %s %s \r\n' % tuple(proxy[:5]))

    def run(self):
        """Thread entry point: check, sort, then save."""
        print(u'代理檢查開始--------------------------------------')
        self.checkPorxy()
        self.sort()
        print(u'開始儲存-----')
        self.save()
        print(u'資料擷取完畢---------------------------------------')


if __name__ == '__main__':
    urls = (r'http://www.cnproxy.com/proxy1.html',)
    # Letter -> digit table used by the listing page to obfuscate ports.
    ports = {"z": "3", "m": "4", "a": "2", "l": "9", "f": "0",
             "b": "5", "i": "7", "w": "6", "x": "8", "c": "1"}
    print(u'頁面採集開始---------------------------------------------------')
    proxylist = get_proxy_addr(urls, ports)
    print(u'代理測試開始---------------------------------------------------')
    proxychek = ProxyCheck(proxylist)
    proxychek.start()
    proxychek.join()

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.