Abstract: CasperJS itself has no direct relationship with Python; the Python side simply invokes CasperJS, which drives the PhantomJS WebKit engine to fetch the fully rendered HTML. Crawling HTML pages produced by client-side JavaScript rendering has long been difficult. In Java this is typically done with HtmlUnit; in Python we can instead use the standalone, cross-platform CasperJS.
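To make the division of labour concrete, here is a minimal sketch of the pipeline (not part of the original scripts): Python shells out to CasperJS, CasperJS drives PhantomJS to render the page, and Python then parses the HTML file that was written out. It assumes casperjs is on the PATH and that the site.js shown below sits in the current directory; temp.html is an illustrative file name.

import subprocess
from bs4 import BeautifulSoup

# Run CasperJS (which drives the PhantomJS WebKit engine); site.js dumps
# the rendered page into temp.html.
proc = subprocess.Popen(
    ["casperjs", "site.js",
     "--url=http://spys.ru/free-proxy-list/ie/",
     "--outputfile=temp.html"],
    stdout=subprocess.PIPE)
print proc.communicate()[0]  # CasperJS debug log

# temp.html now holds the HTML after client-side JavaScript has executed.
soup = BeautifulSoup(open("temp.html"))
print len(soup.find_all("font"))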
Create site.js (the interface file; input: a URL, output: an HTML file)
usage: e:\toolkit\n1k0-casperjs-e3a77d0\bin>python casperjs site.js --url=http://spys.ru/free-proxy-list/ie/ --outputfile='temp.html'

var fs = require('fs');
var casper = require('casper').create({
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
    },
    logLevel: "debug",  // log level
    verbose: true       // log to the console
});

var url = casper.cli.raw.get('url');
var outputfile = casper.cli.raw.get('outputfile');

// request the page and write the rendered HTML to the output file
casper.start(url, function () {
    fs.write(outputfile, this.getHTML(), 'w');
});

casper.run();
Python code, checkout_proxy.py
import json
import sys
#import requests
#import requests.utils, pickle
from bs4 import BeautifulSoup
import os.path, os
import threading
#from multiprocessing import Process, Manager
from datetime import datetime
import traceback
import logging
import re, random
import subprocess
import shutil
import platform

output_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'proxy.txt')
global_log = 'http_proxy' + datetime.now().strftime('%Y-%m-%d') + '.log'
if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs')):
    os.mkdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs'))
global_log = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs', global_log)

logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s] [%(levelname)s] [%(module)s] [%(funcName)s] [%(lineno)d] %(message)s',
                    filename=global_log,
                    filemode='a')
log = logging.getLogger(__name__)

#manager = Manager()
#PROXY_LIST = manager.list()
mutex = threading.Lock()
proxy_list = []

def isWindows():
    if "Windows" in str(platform.uname()):
        return True
    else:
        return False

def getTagsByAttrs(tagName, pageContent, attrName, attrRegValue):
    soup = BeautifulSoup(pageContent)
    return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})

def getTagsByAttrsExt(tagName, fileName, attrName, attrRegValue):
    if os.path.isfile(fileName):
        f = open(fileName, 'r')
        soup = BeautifulSoup(f)
        f.close()
        return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})
    else:
        return None

class Site1Thread(threading.Thread):
    def __init__(self, outputFilePath):
        threading.Thread.__init__(self)
        self.outputFilePath = outputFilePath
        self.fileName = str(random.randint(100, 1000)) + ".html"
        self.setName('Site1Thread')

    def run(self):
        # Make sure site.js is available next to the casperjs executable
        site1_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'site.js')
        site2_file = os.path.join(self.outputFilePath, 'site.js')
        if not os.path.isfile(site2_file) and os.path.isfile(site1_file):
            shutil.copy(site1_file, site2_file)
        #proc = subprocess.Popen(["bash", "-c", "cd %s && ./casperjs site.js --url=http://spys.ru/free-proxy-list/ie/ --outputfile=%s" % (self.outputFilePath, self.fileName)], stdout=subprocess.PIPE)
        if isWindows():
            proc = subprocess.Popen(["cmd", "/c", "%s/casperjs site.js --url=http://spys.ru/free-proxy-list/ie/ --outputfile=%s" % (self.outputFilePath, self.fileName)], stdout=subprocess.PIPE)
        else:
            proc = subprocess.Popen(["bash", "-c", "cd %s && ./casperjs site.js --url=http://spys.ru/free-proxy-list/ie/ --outputfile=%s" % (self.outputFilePath, self.fileName)], stdout=subprocess.PIPE)
        out = proc.communicate()[0]
        htmlFileName = ''
        # The output path is not deterministic on Windows, so check every possible location
        if os.path.isfile(self.fileName):
            htmlFileName = self.fileName
        elif os.path.isfile(os.path.join(self.outputFilePath, self.fileName)):
            htmlFileName = os.path.join(self.outputFilePath, self.fileName)
        elif os.path.isfile(os.path.join(os.path.dirname(os.path.realpath(__file__)), self.fileName)):
            htmlFileName = os.path.join(os.path.dirname(os.path.realpath(__file__)), self.fileName)
        if not os.path.isfile(htmlFileName):
            print 'Failed to get html content from http://spys.ru/free-proxy-list/ie/'
            print out
            sys.exit(3)
        mutex.acquire()
        # Proxies are listed in <font class="spy14"> elements; extract ip and port
        proxyList = getTagsByAttrsExt('font', htmlFileName, 'class', 'spy14$')
        for proxy in proxyList:
            tdContent = proxy.renderContents()
            lineElems = re.split('[<>]', tdContent)
            if re.compile(r'\d+').search(lineElems[-1]) and re.compile('(\d+\.\d+\.\d+)').search(lineElems[0]):
                print lineElems[0], lineElems[-1]
                proxy_list.append("%s:%s" % (lineElems[0], lineElems[-1]))
        mutex.release()
        try:
            if os.path.isfile(htmlFileName):
                os.remove(htmlFileName)
        except:
            pass

if __name__ == '__main__':
    try:
        if len(sys.argv) < 2:
            print "Usage: %s [casperjs path]" % (sys.argv[0])
            sys.exit(1)
        if not os.path.exists(sys.argv[1]):
            print "casperjs path: %s does not exist!" % (sys.argv[1])
            sys.exit(2)
        # Preserve any proxies already collected in proxy.txt
        if os.path.isfile(output_file):
            f = open(output_file)
            lines = f.readlines()
            f.close()
            for line in lines:
                proxy_list.append(line.strip())
        thread1 = Site1Thread(sys.argv[1])
        thread1.start()
        thread1.join()
        f = open(output_file, 'w')
        for proxy in set(proxy_list):
            f.write(proxy + "\n")
        f.close()
        print "Done!"
    except SystemExit:
        pass
    except:
        errMsg = traceback.format_exc()
        print errMsg
        log.error(errMsg)
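Run the script with the CasperJS bin directory as its only argument, e.g. python checkout_proxy.py e:\toolkit\n1k0-casperjs-e3a77d0\bin; the proxies it finds are merged into proxy.txt next to the script. The following is a small follow-up sketch (not from the original article) showing one way to consume that list; the echo URL and variable names are illustrative only.

import random
import urllib2

# Read the "ip:port" entries that checkout_proxy.py wrote to proxy.txt.
proxies = [line.strip() for line in open('proxy.txt') if line.strip()]
proxy = random.choice(proxies)  # e.g. "1.2.3.4:8080"

# Route an HTTP request through the chosen proxy; http://httpbin.org/ip is
# just an illustrative echo service, not something the original script uses.
opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}))
print opener.open('http://httpbin.org/ip', timeout=10).read()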
Use CasperJS to get HTML content generated by JavaScript rendering