Article abstract: CasperJS and Python are not directly related; this approach simply has Python invoke CasperJS, which drives the PhantomJS WebKit engine to fetch a page and return its rendered HTML. Crawling HTML pages whose content is generated by client-side JavaScript has long been extremely difficult. Java has HtmlUnit; for Python, we can use the standalone, cross-platform CasperJS.
Create site.js (the interface file; input: a URL, output: an HTML file).
Usage: E:\toolkit\n1k0-casperjs-e3a77d0\bin>python casperjs site.js --url=http://spys.ru/free-proxy-list/ie/ --outputfile='temp.html'
// site.js - CasperJS interface script.
// Input:  --url=<page to load>  --outputfile=<path of the HTML file to write>
// Output: the page's HTML, after client-side JavaScript has run, saved to
//         the output file.
var fs = require('fs');
var casper = require('casper').create({
    pageSettings: {
        // Only the rendered markup is needed, so skip images and plugins.
        loadImages: false,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
    },
    logLevel: "debug", // log level
    verbose: true      // also print log messages to the console
});

// Use the raw (unparsed) option values so they are taken verbatim as strings.
var url = casper.cli.raw.get('url');
var outputfile = casper.cli.raw.get('outputfile');

// Request the page and, once it has loaded, dump its current HTML to disk.
casper.start(url, function () {
    fs.write(outputfile, this.getHTML(), 'w');
});
casper.run();
Python code (checkout_proxy.py):
"""Collect HTTP proxies from spys.ru via CasperJS and merge them into proxy.txt.

Usage: python checkout_proxy.py <directory containing the casperjs launcher>

The script copies ``site.js`` next to the CasperJS launcher, runs it to save
the JavaScript-rendered proxy list as a temporary HTML file, extracts the
``host:port`` pairs from that file and writes the de-duplicated result to
``proxy.txt`` beside this script.

Exit status: 0 success, 1 missing argument, 2 CasperJS path not found,
3 page could not be fetched/parsed, 4 unexpected error (traceback is logged).
"""
import json
import sys
import os
import os.path
import threading
from datetime import datetime
import traceback
import logging
import re
import random
import subprocess
import shutil
import platform

_SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
_LOG_DIR = os.path.join(_SCRIPT_DIR, 'logs')
# Page that lists the proxies; rendered client-side, hence the need for CasperJS.
_SOURCE_URL = 'http://spys.ru/free-proxy-list/IE/'

# Compiled once instead of on every loop iteration.
_PORT_RE = re.compile(r'\d+')
_HOST_RE = re.compile(r'(\d+\.\d+\.\d+)')

output_file = os.path.join(_SCRIPT_DIR, 'proxy.txt')

# One log file per day, kept in a "logs" directory beside this script.
global_log = 'http_proxy' + datetime.now().strftime('%Y-%m-%d') + '.log'
if not os.path.exists(_LOG_DIR):
    os.mkdir(_LOG_DIR)
global_log = os.path.join(_LOG_DIR, global_log)

logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] [%(levelname)s] [%(module)s] [%(funcName)s] [%(lineno)d] %(message)s',
    filename=global_log,
    filemode='a',
)
log = logging.getLogger(__name__)

# Guards PROXY_LIST, which worker threads append to.
mutex = threading.Lock()
PROXY_LIST = []


def iswindows():
    """Return True when ``platform.uname()`` reports a Windows host."""
    return "Windows" in str(platform.uname())


def _make_soup(markup):
    """Parse *markup* (a string or an open file) with Beautiful Soup."""
    # Imported here, not at module level, so the module stays importable
    # when bs4 is not installed; only the parsing helpers need it.
    from bs4 import BeautifulSoup
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    return BeautifulSoup(markup, 'html.parser')


def gettagsbyattrs(tagname, pagecontent, attrname, attrregvalue):
    """Return the *tagname* tags in *pagecontent* whose *attrname* matches.

    *attrregvalue* is a regular expression applied to the attribute value.
    """
    soup = _make_soup(pagecontent)
    return soup.find_all(tagname, {attrname: re.compile(attrregvalue)})


def gettagsbyattrsext(tagname, filename, attrname, attrregvalue):
    """Like ``gettagsbyattrs`` but read the markup from the file *filename*.

    Returns None when *filename* is not an existing file.
    """
    if not os.path.isfile(filename):
        return None
    # Binary mode lets Beautiful Soup detect the document encoding itself.
    with open(filename, 'rb') as f:
        soup = _make_soup(f)
    return soup.find_all(tagname, {attrname: re.compile(attrregvalue)})


def _parse_proxy(fragment):
    """Return ``(host, port)`` taken from one table cell's inner HTML, or None.

    The text before the first tag is treated as the host and the text after
    the last tag as the port; both must look numeric to be accepted.
    """
    parts = re.split('[<>]', fragment)
    host, port = parts[0], parts[-1]
    if _PORT_RE.search(port) and _HOST_RE.search(host):
        return host, port
    return None


class Site1Thread(threading.Thread):
    """Worker that scrapes the spys.ru proxy list through CasperJS.

    *outputfilepath* is the directory that holds the ``casperjs`` launcher.
    After ``join()``, ``succeeded`` tells whether the page was fetched and
    parsed; proxies found are appended to the module-level ``PROXY_LIST``.
    """

    def __init__(self, outputfilepath):
        threading.Thread.__init__(self, name='Site1Thread')
        self.outputfilepath = outputfilepath
        # Random name for the temporary HTML dump written by site.js.
        self.filename = str(random.randint(100, 1000)) + ".html"
        self.succeeded = False

    def _install_site_script(self):
        """Copy site.js next to the casperjs launcher unless already there."""
        site1_file = os.path.join(_SCRIPT_DIR, 'site.js')
        site2_file = os.path.join(self.outputfilepath, 'site.js')
        if not os.path.isfile(site2_file) and os.path.isfile(site1_file):
            shutil.copy(site1_file, site2_file)

    def _run_casperjs(self):
        """Run site.js under CasperJS and return the captured stdout."""
        # NOTE(review): the command line is assembled by string interpolation
        # and handed to a shell, so a path containing spaces or shell
        # metacharacters will break it.
        if iswindows():
            command = ["cmd", "/c",
                       "%s/casperjs site.js --url=%s --outputfile=%s"
                       % (self.outputfilepath, _SOURCE_URL, self.filename)]
        else:
            command = ["bash", "-c",
                       "cd %s && ./casperjs site.js --url=%s --outputfile=%s"
                       % (self.outputfilepath, _SOURCE_URL, self.filename)]
        proc = subprocess.Popen(command, stdout=subprocess.PIPE)
        return proc.communicate()[0]

    def _locate_output(self):
        """Return the path of the HTML dump, or None when it was not written.

        Where the file lands is not predictable on Windows, so every
        plausible location is checked.
        """
        candidates = (
            self.filename,
            os.path.join(self.outputfilepath, self.filename),
            os.path.join(_SCRIPT_DIR, self.filename),
        )
        for candidate in candidates:
            if os.path.isfile(candidate):
                return candidate
        return None

    def run(self):
        """Fetch the page, extract the proxies and record them in PROXY_LIST."""
        self._install_site_script()
        out = self._run_casperjs()
        htmlfilename = self._locate_output()
        if htmlfilename is None:
            # sys.exit() inside a thread would only end this thread, so the
            # failure is reported through ``succeeded`` instead.
            message = 'Failed to get html content from %s' % _SOURCE_URL
            print(message)
            print(out.decode('utf-8', 'replace'))
            log.error(message)
            return
        try:
            # gettagsbyattrsext returns None if the file vanished meanwhile.
            tags = gettagsbyattrsext('font', htmlfilename, 'class', 'spy14$') or []
            found = []
            for tag in tags:
                pair = _parse_proxy(tag.decode_contents())
                if pair is None:
                    continue
                print("%s %s" % pair)
                found.append("%s:%s" % pair)
            with mutex:
                PROXY_LIST.extend(found)
            self.succeeded = True
        finally:
            # Best-effort cleanup of the temporary HTML dump.
            try:
                os.remove(htmlfilename)
            except OSError as exc:
                log.warning('could not remove %s: %s', htmlfilename, exc)


def main(argv):
    """Run the scraper with command-line *argv* and return the exit status."""
    if len(argv) < 2:
        print("Usage:%s [casperjs path]" % argv[0])
        return 1
    if not os.path.exists(argv[1]):
        print("casperjs path: %s does not exist!" % argv[1])
        return 2

    # Start from the proxies already on disk so earlier results are kept.
    if os.path.isfile(output_file):
        with open(output_file) as f:
            known = [line.strip() for line in f]
        with mutex:
            PROXY_LIST.extend(entry for entry in known if entry)

    thread1 = Site1Thread(argv[1])
    thread1.start()
    thread1.join()
    if not thread1.succeeded:
        return 3

    with open(output_file, 'w') as f:
        # set() drops duplicates; sorting keeps the file stable between runs.
        for proxy in sorted(set(PROXY_LIST)):
            f.write(proxy + "\n")
    print("Done!")
    return 0


if __name__ == '__main__':
    try:
        status = main(sys.argv)
    except Exception:
        errmsg = traceback.format_exc()
        print(errmsg)
        log.error(errmsg)
        status = 4
    sys.exit(status)