# Environment: Windows 7, Python 2.7
#coding=utf8import urllib2,re,os import threadingimport time,datetimedef get_proxy_addr(urls,ports): proxylist = [] p = re.compile('''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''') for url in urls: res = urllib2.urlopen(url) pageinfo = res.read() #print pageinfo ips = p.findall(pageinfo) #根據需要構造出一定格式的條目 for row in ips: ip = row[0] port = map(lambda x:ports[x],row[1].split('+')) port = ''.join(port) agent = row[2] addr = row[3] l = [ip, port, agent, addr] proxylist.append(l) print u'資料分析完畢開始返回--------------------------------------------' return proxylistclass ProxyCheck(threading.Thread): ''' 用來檢查擷取到的代理是否可用 以及在本網上的速度 ''' def __init__(self,proxylist): threading.Thread.__init__(self) self.proxylist = proxylist self.timeout = 10 self.test_url = "http://www.baidu.com" self.test_str = '030173' self.checkedPorxyList = [] def checkPorxy(self): #第一步啟用 cookie cookies = urllib2.HTTPCookieProcessor() for proxy in self.proxylist: proxy_server = r'http://%s:%s' %(proxy[0],proxy[1]) #第二步 裝載代理 proxy_hander = urllib2.ProxyHandler({"http":proxy_server}) #第三步 組合request try: opener = urllib2.build_opener(cookies, proxy_hander) pass except urllib2.URLError: print u'url設定錯誤' continue #配置request opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1')] #發送請求 urllib2.install_opener(opener) t1 = time.time() try: req = urllib2.urlopen(self.test_url,timeout=self.timeout) result = req.read() pos = result.find(self.test_str) timeused = time.time() - t1 if pos>1: self.checkedPorxyList.append((proxy[0],proxy[1],proxy[2],proxy[3],timeused)) print u'成功採集',proxy[0],timeused else: continue except Exception,e: print proxy[0],'timeout' continue def sort(self): sorted(self.checkedPorxyList,cmp=lambda x,y:cmp(x[4],y[4])) def save(self): path = os.getcwd() filename = path + 
'/Proxy-'+datetime.datetime.now().strftime(r'%Y%m%d%H%M%S')+'.txt' f = open(filename,'w+') for proxy in self.checkedPorxyList: f.write('%s %s %s %s %s \r\n'%(proxy[0],proxy[1],proxy[2],proxy[3],proxy[4])) f.close() def run(self): print u'代理檢查開始--------------------------------------' self.checkPorxy() self.sort() print '開始儲存-----' self.save() print u'資料擷取完畢---------------------------------------' if __name__=='__main__': urls = (r'http://www.cnproxy.com/proxy1.html',) ports = {"z":"3","m":"4","a":"2","l":"9","f":"0","b":"5","i":"7","w":"6","x":"8","c":"1"} print u'頁面採集開始---------------------------------------------------' proxylist = get_proxy_addr(urls,ports) print u'代理測試開始---------------------------------------------------' proxychek = ProxyCheck(proxylist) proxychek.start() proxychek.join()