IP proxy pool based on a MongoDB database

Source: Internet
Author: User
Tags: xpath

The code is written for Python 2.7. It crawls the free proxies listed on Xici (xicidaili.com), tests each one, and stores the results in a MongoDB database so that later crawlers can use them. The code follows directly below.

# -*- coding: utf-8 -*-
"""Build a small HTTP proxy pool in MongoDB from the Xici free-proxy list.

The script crawls listing pages of http://www.xicidaili.com/wt/, upserts
every proxy row it finds into the ``Xici.Xiciipinfo`` collection, then
probes each stored proxy and deletes the ones that cannot complete a
request.

The ``print(...)`` calls take a single argument so the file runs unchanged
on both Python 2.7 (the version it was written for) and Python 3.
"""

import time
from multiprocessing import Pool  # kept from the original script; unused here

import pymongo
import requests
from lxml import etree


class GetProxy(object):
    """Crawl Xici proxy listings, store them in MongoDB and prune dead ones."""

    def __init__(self):
        """Set the request headers and open the local MongoDB collection."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/59.0.3071.115 Safari/537.36'
        }
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['Xici']
        self.xiciipinfo = self.xici['Xiciipinfo']

    def getip(self, num):
        """Scrape listing page ``num`` and upsert each proxy row into MongoDB.

        :param num: page number appended to ``self.url``.
        :raises requests.RequestException: if the listing page cannot be
            fetched (including the 10 second timeout).
        """
        url = self.url + str(num)
        # A timeout keeps one unresponsive page from hanging the whole crawl.
        wb_data = requests.get(url, headers=self.headers, timeout=10)
        html = etree.HTML(wb_data.text)
        ips = html.xpath('//tr[@class="odd"]/td[2]/text()')
        ports = html.xpath('//tr[@class="odd"]/td[3]/text()')
        protocols = html.xpath('//tr[@class="odd"]/td[6]/text()')
        areas = html.xpath('//tr[@class="odd"]/td[4]/a/text()')
        # NOTE(review): the four lists are paired purely by position and zip()
        # stops at the shortest one -- confirm that every row has all cells.
        for ip, port, protocol, area in zip(ips, ports, protocols, areas):
            data = {
                'IP': ip,
                'Port': port,
                'Protocol': protocol,
                'Area': area,
            }
            print(data)
            # Upsert keyed on the address so re-crawling refreshes an existing
            # document instead of inserting a duplicate.
            self.xiciipinfo.update_one({'IP': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Crawl listing pages ``1 .. num - 1``, sleeping 2 s after each page.

        :param num: exclusive upper bound of the page range.
        """
        for page in range(1, num):
            self.getip(page)
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``requests``-style proxies dict.

        :returns: list of ``{'http': 'http://<IP>:<Port>'}`` dicts, one per
            document in the collection.
        """
        return [
            {'http': 'http://{}:{}'.format(doc['IP'], doc['Port'])}
            for doc in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Probe ``proxy`` and delete it from the database if the request fails.

        :param proxy: dict of the form ``{'http': 'http://<ip>:<port>'}`` as
            produced by :meth:`getiplist`.
        """
        # Strip the leading 'http://' (7 characters) and drop the port.
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except requests.RequestException:
            # Only network-level failures mark the proxy as dead; a bare
            # ``except`` would also swallow KeyboardInterrupt / SystemExit.
            print('field...............>>>>>>>>>>>>>>>>>>>>> >>>')
            self.xiciipinfo.delete_many({'IP': ip})
            print('Remove it now .....  {}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)


if __name__ == '__main__':
    proxy = GetProxy()
    try:
        proxy.count(2)
        # Explicit loop: ``map`` is lazy on Python 3 and would test nothing.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the MongoDB connection even if crawling or testing raised.
        proxy.dbclose()

IP proxy pool based on a MongoDB database

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months of usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.