Abstract: CasperJS and Python are not directly related; the approach here simply relies on CasperJS driving PhantomJS (a headless WebKit browser) to obtain the HTML content of a page, which Python then processes. Crawling HTML pages whose content is generated by client-side JavaScript rendering has long been extremely difficult. Java has HtmlUnit for this; in Python we can use CasperJS, a standalone, cross-platform tool.
Create site.js (the interface script; input: a URL, output: an HTML file)
// Usage: e:\toolkit\n1k0-casperjs-e3a77d0\bin>python casperjs site.js --url=http://spys.ru/free-proxy-list/ie/ --outputfile='temp.html'
//
// site.js: open the page given by --url in CasperJS and write the HTML
// returned by getHTML() to the file given by --outputfile.
//
// NOTE(review): identifier casing and the user-agent string were restored
// from a case-mangled listing; confirm against the original script.

var fs = require('fs');
var casper = require('casper').create({
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
    },
    logLevel: "debug", // log level
    verbose: true      // log to console
});

// Raw (unparsed) command-line option values.
var url = casper.cli.raw.get('url');
var outputfile = casper.cli.raw.get('outputfile');

if (!url || !outputfile) {
    // Without both options there is nothing to fetch or nowhere to write.
    casper.echo('Usage: casperjs site.js --url=<URL> --outputfile=<FILE>');
    casper.exit(1);
} else {
    // Request the page and dump its HTML once it has loaded.
    casper.start(url, function () {
        fs.write(outputfile, this.getHTML(), 'w');
    });

    // start() only queues the step; run() actually executes it.
    casper.run();
}
Python code: checkout_proxy.py
Import JSON import sys #import requests #import requests.utils, pickle from BS4 import BeautifulSoup Import Os.path,os Import Threading #from multiprocessing import Process, Manager from datetime import datetime Import traceback Import logging import re,random import subprocess import shutil Import Platform Output_file = Os.path.join (Os.path.dirname (Os.path.realpath (__file__)), ' Proxy.txt ') Global_lo g = ' http_proxy ' + DateTime.Now (). Strftime ('%y-%m-%d ') + '. Log ' if not os.path.exists (Os.path.join (Os.path.dirname (OS. Path.realpath (__file__)), ' logs '): Os.mkdir (Os.path.join (Os.path.dirname (Os.path.realpath), ' __file__ ')) GL Obal_log = Os.path.join (Os.path.dirname (Os.path.realpath (__file__)), ' logs ', Global_log) logging.basicconfig (level= Logging. debug,format= ' [% (asctime) s] [% (LevelName) s] [% (module) s] [% (FuncName) s] [% (Lineno) d]% (message) s ', Filename=global_ Log,filemode= ' a ') Log = Logging.getlogger (__name__) #manager = Manager () #PROXY_LIST = Manager.list () Mutex = threading. Lock () proxy_list = [] def iswindows (): If "Windows" in Str (Platform.uname ()): Return True Else:return False def gettagsbyattrs (tagname,pagecontent,attrname,attrregvalue): soup = be Autifulsoup (PageContent) return Soup.find_all (TagName, {attrname: Re.compile (Attrregvalue)}) def gettagsbyattrsext (tagname,filename,attrname,attrregvalue): if Os.path. Isfile (filename): f = open (filename, ' r ') soup = BeautifulSoup (f) f.close () return Soup.find_all (TagName, {attrName:re.compile (Attrregvalue)}) Else:return None class Site1thread (threading . Thread): Def __init__ (Self,outputfilepath): Threading. Thread.__init__ (self) self.outputfilepath = Outputfilepath Self.filename = STR (Random.randint (100,1000)) + ". 
html" self.setname (' Site1thread ') def run (self): Site1_file = O S.path.join (Os.path.dirname (Os.path.realpath (__file__)), ' site.js ') Site2_file = Os.path.join (Self.outputfilepath, ' Site.js ') if not os.path.isfile (Site2_file) and Os.path.isfile (site1_file): Shutil.copy (Site1_file,site2_fi Le) #proc = subprocess. Popen (["Bash", "-C", "CD%s &&./casperjs site.js--url=http://spys.ru/free-proxy-list/ie/--outputfile=%s"% ( Self.outputfilepath,self.filename)],stdout=subprocess. PIPE) if Iswindows (): proc = subprocess. Popen (["cmd", "/C", "%s/casperjs site.js--url=http://spys.ru/free-proxy-list/ie/--outputfile=%s"% ( Self.outputfilepath,self.filename)],stdout=subprocess. PIPE) Else:proc = subprocess. Popen (["Bash", "-C", "CD%s &&./casperjs site.js--url=http://spys.ru/free-proxy-list/ie/--outputfile=%s"% ( Self.outputfilepath,self.filename)],stdout=subprocess. PIPE) Out=proc.communicate () [0] htmlfilename = ' #因为输出路径在windows不确定, so this adds all possible paths to determine if Os.path.isfile (sel F.filename): Htmlfilename = Self.filename elif os.path.isfile (Os.path.join (self.outputfilepath,self.filename ): Htmlfilename = Os.path.join (self.outputfilepath,self.filename) elif os.path.isfile (Os.path.join (Os.path). DirName (Os.path.realpath (__file__)), Self.filename)): Htmlfilename = Os.path.join (Os.path.dirname (Os.path.realpath (__file__)), self.filename) if (not Os.path.isfile (htmlfilename)): print ' Failed-get HTML content from HT tp://spys.ru/free-proxy-list/ie/' Print out Sys.exit (3) mutex.acquire () proxylist= Gettagsbya Ttrsext (' Font ', Htmlfilename, ' class ', ' spy14$ ') for proxy in proxylist:tdcontent = Proxy.rendercontents () Lineelems = Re.split (' [<>] ', tdcontent) if Re.compile (R ' \d+ '). Search (Lineelems[-1]) and Re.compile (' (\d+ \.\d+\.\d+) '). 
Search (lineElems[0]): Print lineelems[0],lineelems[-1] proxy_list.append ("%s:%s"% (Lineelems[0],lineelems[-1])) Mutex.release () try:if os.path.isfile (htmlfilename): Os.remove (htmlfilename) except: Pass if __name__ = = ' __main__ ': Try:if (len (SYS.ARGV)) < 2:print "usage:%s [Casperjs PA TH] "% (Sys.argv[0]) sys.exit (1) if not os.path.exists (Sys.argv[1]): print" Casperjs path:%s does Not exist! "% (Sys.argv[1]) Sys.exit (2) if Os.path.isfile (output_file): F = open (Output_file) lines = F.readlines () f.close for line in Lines:PROXY_LIST.append (Line.strip ()) Thread1 = Site1thread (sys.argv[1]) Thread1.start () Thread1.join () F = open (Output_file, ' W ') for Pro XY in Set (proxy_list): F.write (proxy+ "\ n") f.close () print "done!" Except Systemexit:pass except:eRrmsg = Traceback.format_exc () print errmsg log.error (errmsg)