The code that works under Mac or Linux is as follows (this is the full script described in the article below, with randomized request headers and a relative output path):
# coding=utf-8
import requests
import re
from bs4 import BeautifulSoup as bs
import Queue
import threading
import random

# pools of request headers to pick from at random, so the scan looks less like a bot
headers_useragents = []
headers_referers = []
headers_referers.append('http://www.google.com/?q=')
headers_referers.append('http://www.usatoday.com/search/results?q=')
headers_referers.append('http://engadget.search.aol.com/search?q=')
headers_useragents.append('Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3) Gecko/20090913 Firefox/3.5.3')
headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 6.1; en; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)')
headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)')
headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.1) Gecko/20090718 Firefox/3.5.1')
headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.6 Safari/532.1')
headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)')
headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30729)')
headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Win64; x64; Trident/4.0)')
headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 2.0.50727; InfoPath.2)')
headers_useragents.append('Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)')
headers_useragents.append('Mozilla/4.0 (compatible; MSIE 6.1; Windows XP)')
headers_useragents.append('Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.5.22 Version/10.51')


class ProxyPick(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        while not self._queue.empty():
            url = self._queue.get()
            proxy_spider(url)


def proxy_spider(url):
    # build a randomized header set for each listing page we fetch
    headers = {}
    headers['user-agent'] = random.choice(headers_useragents)
    headers['cache-control'] = 'no-cache'
    headers['Accept-Charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
    headers['Referer'] = random.choice(headers_referers) + str(random.randint(5, 10))
    headers['keep-alive'] = str(random.randint(110, 120))
    headers['Connection'] = 'keep-alive'
    r = requests.get(url=url, headers=headers)
    soup = bs(r.content, "html.parser")
    data = soup.find_all(name='tr', attrs={'class': re.compile('|[^odd]')})
    for i in data:
        soup = bs(str(i), 'html.parser')
        data2 = soup.find_all(name='td')
        ip = str(data2[1].string)
        port = str(data2[2].string)
        types = str(data2[5].string).lower()
        proxy = {}
        proxy[types] = '%s:%s' % (ip, port)
        print proxy, "check proxy"
        try:
            proxy_check(proxy, ip)
        except Exception, e:
            print e
            pass


def proxy_check(proxy, ip):
    # other pages that echo back the requesting IP:
    # url = 'http://1212.ip138.com/ic.asp'
    # url = 'https://www.ipip.net/ip.html'
    # url = 'http://www.baidu.com'
    # url = 'http://ip138.com/'
    url = 'http://2018.ip138.com/ic.asp'
    r = requests.get(url=url, proxies=proxy, timeout=6)
    # r.encoding = 'gb2312'  # needed when url = 'http://ip138.com/'
    reip = r'\[(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\]'
    # print r.text
    f = open('ip_proxy.txt', 'a+')
    found = re.search(reip, r.text, re.M | re.I)
    if found:
        ip2 = found.group(1)
        print "ip==>:", ip2
        if ip2 == ip:
            print "*" * 30
            print "ip is wanted:", ip
            f.write('%s' % proxy + '\n')
            print "*" * 30
            # import sys
            # sys.exit(0)
    f.close()


# proxy_spider()

def main():
    queue = Queue.Queue()
    for i in range(1, 2288):
        queue.put('http://www.xicidaili.com/nn/' + str(i))
    threads = []
    thread_count = 10
    for i in range(thread_count):
        spider = ProxyPick(queue)
        threads.append(spider)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    print "It's down, sir!"


if __name__ == '__main__':
    main()
Python: scan a proxy site and collect available proxy IPs (source: https://www.jb51.net/article/120480.htm). This article walks through a small Python script that scans a free-proxy website and checks which of the listed proxies actually work. It is shared here as a reference; let's take a look.
Today we will write a very useful tool: a script that scans for, and collects, usable proxies.
First, a quick Baidu search turns up a website to use as an example: http://www.xicidaili.com
This site publishes many available proxy IPs and ports, both domestic and foreign.
As usual, we start by analyzing the page; we will scan all of the domestic proxies first.
Opening the domestic section and inspecting it, we find that the domestic proxy listings live under URLs of the following form:
http://www.xicidaili.com/nn/x
Here x runs to almost 2,000 pages, so it looks like we will need multithreading again...
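To give a sense of the scale, the final script's main() simply pushes every listing page URL into a queue for the worker threads to consume (we will come back to the threading at the end):

import Queue

queue = Queue.Queue()
for i in range(1, 2288):   # roughly 2,000+ listing pages
    queue.put('http://www.xicidaili.com/nn/' + str(i))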
As usual, we first try to fetch the content with the simplest possible requests.get().
It returns 503, so we add a simple headers dictionary.
Now it returns 200. Let's carry on.
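A minimal sketch of that first experiment, assuming a browser-like User-Agent is all the site wants (the exact string below is just an example):

import requests

url = 'http://www.xicidaili.com/nn/1'

r = requests.get(url)
print r.status_code    # 503 when no headers are sent

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)'}
r = requests.get(url, headers=headers)
print r.status_code    # 200 once a browser-like User-Agent is included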
Okay, now let's analyze the page content and extract what we want.
We find that the IP information is contained in <tr> tags, so we can easily use BeautifulSoup (BS) to pull those tags out.
However, we then find that the IP, port and protocol sit in the 2nd, 3rd and 6th <td> tags inside each extracted <tr> tag.
r = requests.get(url=url, headers=headers)
soup = bs(r.content, "html.parser")
data = soup.find_all(name='tr', attrs={'class': re.compile('|[^odd]')})
for i in data:
    soup = bs(str(i), 'html.parser')
    data2 = soup.find_all(name='td')
    ip = str(data2[1].string)
    port = str(data2[2].string)
    types = str(data2[5].string).lower()
    proxy = {}
    proxy[types] = '%s:%s' % (ip, port)
This gives us a proxy dictionary on each pass through the loop, which we can then use to check whether the IP is actually usable.
A note on this dictionary: we convert types to lowercase, because the protocol key used in the proxies argument of the get() method must be lowercase, while the page lists it in uppercase, hence the case conversion.
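For illustration, this is roughly the shape requests expects for the proxies argument; the IP and port below are made up:

proxy = {'http': '123.123.123.123:8080'}   # note the lowercase protocol key
r = requests.get('http://www.example.com', proxies=proxy, timeout=6)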
So what is the idea for verifying that an IP is usable?
Very simple: we make a GET request through our proxy to the following website:
http://1212.ip138.com/ic.asp
This is a handy website that simply returns whatever external IP your request arrives from.
url = 'http://1212.ip138.com/ic.asp'
r = requests.get(url=url, proxies=proxy, timeout=6)
Here we need to add a timeout to drop proxies that take too long to respond; I set it to 6 seconds.
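A slow or dead proxy makes requests raise an exception rather than hang forever; a minimal sketch of catching that around the request above (the full script simply wraps the whole check in a broad try/except):

try:
    r = requests.get(url=url, proxies=proxy, timeout=6)
except requests.exceptions.RequestException as e:
    # timeouts, refused connections and broken proxies all end up here
    print e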
We try it with one IP and parse the page that comes back.
The returned page shows the IP address the website saw, wrapped in square brackets.
Then we just need to extract the contents of [] within the page.
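One way to do that (this is the approach the full script at the top of this article takes) is a simple regular expression over the response text, assuming r holds the response from the test page:

import re

# matches an IPv4 address wrapped in square brackets, e.g. [123.123.123.123]
reip = r'\[(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\]'
found = re.search(reip, r.text)
if found:
    ip2 = found.group(1)   # the IP the test site saw the request coming from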
If our proxy works, the site returns the proxy's IP.
(Sometimes the local network IP comes back instead; I am not entirely sure why, but I treat that case as the proxy being unavailable and exclude it.)
So we can make a simple judgement: if the IP that comes back matches the IP in our proxy dictionary, we consider that IP a usable proxy and write it to a file.
That is the whole idea; finally we wire it together with a queue and threading.
On to the code:
# coding=utf-8
import requests
import re
from bs4 import BeautifulSoup as bs
import Queue
import threading


class ProxyPick(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        while not self._queue.empty():
            url = self._queue.get()
            proxy_spider(url)


def proxy_spider(url):
    headers = {
        # fill in your request headers here (see the full version at the top of this article)
    }
    r = requests.get(url=url, headers=headers)
    soup = bs(r.content, "html.parser")
    data = soup.find_all(name='tr', attrs={'class': re.compile('|[^odd]')})
    for i in data:
        soup = bs(str(i), 'html.parser')
        data2 = soup.find_all(name='td')
        ip = str(data2[1].string)
        port = str(data2[2].string)
        types = str(data2[5].string).lower()
        proxy = {}
        proxy[types] = '%s:%s' % (ip, port)
        try:
            proxy_check(proxy, ip)
        except Exception, e:
            print e
            pass


def proxy_check(proxy, ip):
    url = 'http://1212.ip138.com/ic.asp'
    r = requests.get(url=url, proxies=proxy, timeout=6)
    f = open('E:/url/ip_proxy.txt', 'a+')
    soup = bs(r.text, 'html.parser')
    data = soup.find_all(name='center')
    for i in data:
        a = re.findall(r'\[(.*?)\]', i.string)
        if a[0] == ip:
            # print proxy
            f.write('%s' % proxy + '\n')
            print 'write down'
    f.close()


# proxy_spider()

def main():
    queue = Queue.Queue()
    for i in range(1, 2288):
        queue.put('http://www.xicidaili.com/nn/' + str(i))
    threads = []
    thread_count = 10
    for i in range(thread_count):
        spider = ProxyPick(queue)
        threads.append(spider)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    print "It's down, sir!"


if __name__ == '__main__':
    main()
This writes every usable proxy IP found on the site into the file ip_proxy.txt.
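Once the file has some entries they can be read back and reused. Here is a minimal, hypothetical sketch, assuming each line holds the repr of a proxy dict exactly as proxy_check writes it:

import ast
import requests

with open('ip_proxy.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        proxy = ast.literal_eval(line)   # e.g. {'http': '123.123.123.123:8080'}
        r = requests.get('http://1212.ip138.com/ic.asp', proxies=proxy, timeout=6)
        print proxy, r.status_code
        break   # just try the first saved proxy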
That is everything I have to share about this Python proxy-scanning example. I hope it gives you a useful reference, and I hope you will continue to support Script Home (jb51.net).