The following Python 2.7 code crawls the free proxies listed on Xici (xicidaili.com), tests them, and stores them in a database, in preparation for later crawlers. The code is given directly below.
1 #-*-encoding=utf-8-*-2 3 ImportRequests4 fromlxmlImportetree5 Import Time6 ImportPymongo7 fromMultiprocessingImportPool8 9 Ten classGetProxy (object): One def __init__(self): ASelf.headers = {'user-agent':'mozilla/5.0 (Windows NT 6.1; Win64; x64) applewebkit/537.36 (khtml, like Gecko) chrome/59.0.3071.115 safari/537.36'} -Self.url ='http://www.xicidaili.com/wt/' -Self.client = Pymongo. Mongoclient ('localhost', 27017) theSelf.xici = self.client['Xici'] -Self.xiciipinfo =self.xici['Xiciipinfo'] - #Self.removeip = ' 127.0.0.1 ' #第一次运行会检测该变量, because only the following test fails to assign a value - + defGetIP (self,num): - #Climbing Shrine All agents, updates into the database +url = self.url +str (num) AWb_data = Requests.get (URL, headers=self.headers) atHTML =etree. HTML (Wb_data.text) - #htmls = etree.tostring (HTML) -IPS = Html.xpath ('//tr[@class = "odd"]/td[2]/text ()') -Ports = Html.xpath ('//tr[@class = "odd"]/td[3]/text ()') -protocols = Html.xpath ('//tr[@class = "odd"]/td[6]/text ()') -Areas = Html.xpath ('//tr[@class = "odd"]/td[4]/a/text ()') in forIP, port, protocol, areainchZip (IPs, ports, protocols, areas): -data = { to 'IP': IP, + 'Port': Port, - 'Protocol': Protocol, the ' Area': Area, * } $ PrintDataPanax Notoginseng #self.xiciipinfo.insert_one (data) - #if Self.removeip! 
= IP: #此处加一个判断, if it is an unused IP that is detected below, it will not be updated to enter the database, can save the following detection time theSelf.xiciipinfo.update ({'IP': IP}, {'$set':d ATA}, True) + A the defcount (self,num): + forIinchRange (1, num): - Self.getip (i) $Time.sleep (2) $ - - defdbclose (self): the self.client.close () - Wuyi the defgetiplist (self): - #organize data in the database into a list WuIPS =Self.xiciipinfo.find () -Proxylist = [] About forIinchIPs: $b ="http"+"://"+ i['IP'] +":"+ i['Port'] -Proxies = {"http": b} - #Print Proxies - proxylist.append (proxies) A #Print Proxylist + returnproxylist the - defiptest (self, proxy): $ #detects IP and updates into the database, deleting the unavailable IP theIP = proxy['http'][7:].split (':') [0] the Try: theRequests.get ('http://wenshu.court.gov.cn/', Proxies=proxy, timeout = 6) the except: - Print 'field...............>>>>>>>>>>>>>>>>>>>>> >>>' in #self.removeip = IP #赋值给类属性 theSelf.xiciipinfo.remove ({'IP': IP})#use the Remove method to delete the qualified the Print 'Remove it now ..... {}'. Format (IP) About Else: the Print '<<<<<<<<<<<<<<<<<.............success' the PrintProxy the + - if __name__=='__main__': thePool =Pool ()BayiProxy =GetProxy () theProxy.count (2) theIPList =proxy.getiplist () - map (proxy.iptest, IPList) -Proxy.dbclose ()
IP proxy pool based on a MongoDB database