The simplest way is to use Python's urllib2.urlopen() function;
Then, some websites like to ban IPs, so you have to find a batch of proxies and rotate through them to fetch their pages;
Some websites do not allow programs to crawl them, so some header information must be added;
Some websites require logging in and carrying cookies;
Finally, to improve efficiency, it is best to use multiple threads. (PS. Note that the urlopen function uses a single global opener object. If you use multiple threads and each thread uses its own proxy, you should not call urlopen; instead, build a per-thread opener and call opener.open().)
The following is a script for fetching proxies, written in Python. Although I am no longer on CERNET, it still comes in handy sometimes :)
# -*- coding: cp936 -*-
# Module setup.  This script targets Python 2 (urllib2, thread, print statement).
import urllib2
import re
import thread
import time
import socket

# Give every new socket a 10-second timeout so a dead proxy cannot hang a
# worker thread forever (the original had a typo: "setdefatimetimeout").
socket.setdefaulttimeout(10)
# ----------------------- Define the function for capturing the proxy -------------------------------#
Def getcnproxy (name ):
Pagenum = 0
Result = []
Getallpages = 0
Trycount = 0
While getallpages = 0 and trycount <= 6:
Pagenum = pagenum + 1
Url = 'HTTP: // www.proxycn.com/html_proxy/http-'{str (pagenum?#'.html'
Try:
Html = urllib2.urlopen (URL)
IP =''
For line in HTML:
If ''' ondblclick = "clip ''' in line:
Proxy = line [LINE. Find ("clip ('") + 6: line. Find ("')")]
Lock. Acquire ()
Print name, proxy
Lock. Release ()
Result. append (proxy)
If 'Next page | last page' in line:
Getallpages = 1
Except t:
Trycount = trycount + 1
Pagenum = pagenum-1
Proxylist [0] = Result
Return result
Def getproxycn (name ):
Pagenum = 0
Result = []
Getallpages = 0
Trycount = 0
While pagenum <= 9 and trycount <= 2:
Pagenum = pagenum + 1
Url = 'HTTP: // www.cnproxy.com/proxy'{str (pagenum?}'.html'
Try:
Html = urllib2.urlopen (URL)
For line in HTML:
If "HTTP" in line:
Proxy = line [line. find ('<TD>') + 4: line. find ('& #820')] + line [line. find (':'): line. find ('</TD> <TD>')]
Lock. Acquire ()
Print name, proxy
Lock. Release ()
Result. append (proxy)
Except t:
Trycount = trycount + 1
Pagenum = pagenum-1
Proxylist [1] = Result
Return result
# ------------------------- --------------- End proxy capture function definition --------------------------------------------------#
# ---------------------------------------- Define the function of the verification proxy ---------------------------------------------------#
def proxycheckone(proxy):
    """Verify one proxy by fetching www.facebook.com through it.

    proxy -- "ip:port" string.

    Returns "ip:port$tries#seconds" on success, or [] on failure.
    (The two return types differ; callers only test len(result) > 0.)
    """
    url = 'http://www.facebook.com'
    proxy_url = 'http://' + proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    r = urllib2.Request(url)
    # Browser-like headers to avoid 403 errors.
    r.add_header("Accept-Language", "zh-cn")
    r.add_header("Content-Type", "text/html; charset=gb2312")
    r.add_header("User-Agent",
                 "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    trycount = 1
    t = 0                    # elapsed seconds; only meaningful on success
    while trycount <= 2:
        try:
            t0 = time.time()
            f = opener.open(r)
            data = f.read()
            if 'Welcome to Facebook!' in data:
                t = time.time() - t0
                break
            else:
                # Page fetched but the content is wrong: the proxy is
                # mangling traffic, reject it immediately.
                return []
        except Exception:
            # Fetch failed: wait a moment, then retry once.
            time.sleep(3)
            trycount = trycount + 1
    if trycount > 2:
        return []
    return proxy + '$' + str(trycount) + '#' + str(t)
def proxycheck(idnum):
    """Worker thread: pop proxies off the shared proxylist and verify them.

    idnum -- this worker's index into the shared done-flag list x.

    Working proxies (with timing info from proxycheckone) are appended to
    the shared result list y.  Shared state: r guards proxylist, a guards y.
    """
    while 1:
        r.acquire()
        try:
            i = proxylist[0]
            del proxylist[0]
            r.release()
        except IndexError:
            # proxylist is empty: mark this worker finished and exit.
            r.release()
            x[idnum] = 1
            break
        b = proxycheckone(i)
        if len(b) > 0:
            a.acquire()
            y.append(b)
            a.release()
# ---------------------------------------- The Function Definition of the verification proxy ends -------------------------------------------------#
# ----------------------------- Capture proxy, the captured proxy is placed in proxies.txt, separated by/n --------------------------------#
# X = '''
Lock = thread. allocate_lock ()
Proxylist = [[], []
Thread. start_new (getcnproxy, ('cnproxy ',))
Thread. start_new (getproxycn, ('xycn ',))
While [] In proxylist:
Time. Sleep (30)
Proxylist = proxylist [0] + proxylist [1]
W=open('proxies.txt ', 'A ')
W. Write ('/N'. Join (proxylist ))
W. Close ()
Del proxylist
Print 'get all proxies! /N/N'
#'''
# ----------------------------- The capture proxy is complete, and the captured proxy is placed in proxies.txt, separated by/n -------------------------------#
# --------------------------------------------------- Verification proxy -----------------------------------------------------#
W=open('proxies.txt ')
Proxylist = List (SET (Re. sub (R' (/T + [^/n] */n |/n) ', W. read ())). split (',')))
While ''In proxylist:
Del proxylist [proxylist. Index ('')]
W. Close ()
Lock = thread. allocate_lock ()
R = thread. allocate_lock ()
A = thread. allocate_lock ()
Y = []
X = [0] x 120
For idnum in range (0,120 ):
Thread. start_new (proxycheck, (idnum ,))
While 0 in X:
Print Len (proxylist), sum (x), "Left", Len (y)
Time. Sleep (10)
W=open('proxies.txt ', 'w ')
W. write (Re. sub ('^/N', '', re. sub (R'/n + ','/N', '/N '. join (y) + '/N ')))
W. Close ()
# -------------------------------------------------- Verification proxy is completed --------------------------------------------------#
The source code and a compiled program can be downloaded here.