V8 is the JavaScript engine embedded in Chromium and is known for being one of the fastest JavaScript engines. PyV8 is a Python wrapper around V8's external API, which allows Python code to execute JavaScript directly.
The first step is a normal page request, which returns HTML containing the obfuscated JS function:
import re

import PyV8
import requests
Target_url = "http://www.kuaidaili.com/proxylist/1/"
def gethtml(url, cookie=None):
    """Fetch ``url`` with browser-like headers and return the raw response body.

    Args:
        url: Address of the page to request.
        cookie: Optional dict of cookies to send with the request.

    Returns:
        The response body as returned by ``requests`` (``Response.content``).
    """
    # Header names and values must not carry surrounding spaces, otherwise
    # they are sent as malformed (or rejected as invalid) header fields.
    header = {
        "Host": "www.kuaidaili.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    html = requests.get(url=url, headers=header, timeout=30, cookies=cookie).content
    return html
# First request: fetch the page to obtain the dynamically generated, obfuscated JS
first_html = gethtml (Target_url)
Because the response is an HTML page rather than a bare JS function, regular expressions are needed to extract both the JS function and the argument it is called with.
Content returned by the first request:
# Extract the obfuscated JS function (everything from "function" up to </script>)
js_func = "".join(re.findall(r"(function .*?)</script>", first_html))
print("Get JS func:\n%s" % js_func)
# Extract the numeric argument the page passes to that function via setTimeout("name(123)", ...)
js_arg = "".join(re.findall(r'setTimeout\(\"\D+\((\d+)\)\"', first_html))
print("Get JS arg:\n%s" % js_arg)
It is also important to note that the JS function does not return the cookie; it sets the cookie directly in the browser. We therefore replace eval("qo=eval;qo(po);") with return po, so that the function returns the contents of po instead.
def executejs(js_func_string, arg):
    """Evaluate a JS function definition with PyV8 and call it with ``arg``.

    Args:
        js_func_string: Source text of a single JavaScript function.
        arg: Argument passed to the evaluated function.

    Returns:
        Whatever the JavaScript function returns.
    """
    # NOTE(review): this executes JavaScript taken from a remote page; only
    # use it against sites whose script you are prepared to run locally.
    ctxt = PyV8.JSContext()
    ctxt.enter()
    try:
        # Wrapping the source in parentheses makes it a function expression,
        # so eval() yields a callable instead of merely declaring the function.
        func = ctxt.eval("({js})".format(js=js_func_string))
        return func(arg)
    finally:
        # Leave the context even if evaluation or the call raises.
        ctxt.leave()
# Patch the JS function so it returns the cookie statement instead of eval-ing it
js_func = js_func.replace('eval("qo=eval;qo(po);")', "return po")
# Execute the patched JS to obtain the cookie string
cookie_str = executejs(js_func, js_arg)
The cookie is returned as a string, but requests.get() expects a dictionary, so it has to be converted:
def parsecookie(string):
    """Convert a ``document.cookie=...`` JS statement into a one-entry dict.

    Only the first ``name=value`` pair (the text before the first ``;``) is
    kept; attributes such as ``expires`` or ``path`` are discarded.

    Args:
        string: Text such as ``document.cookie='name=value; expires=...'``.

    Returns:
        A dict ``{name: value}`` suitable for the ``cookies`` argument of
        ``requests.get``.

    Raises:
        ValueError: If no ``name=value`` pair can be found.
    """
    # Drop the assignment prefix and any quote that opens the JS string literal.
    string = string.replace("document.cookie=", "").lstrip("'\" ")
    clearance = string.split(";")[0]
    # partition() splits on the first "=" only, so values containing "=" survive.
    name, sep, value = clearance.partition("=")
    name = name.strip()
    if not sep or not name:
        raise ValueError("no name=value pair found in cookie string: %r" % string)
    return {name: value.strip().strip("'\"")}
# Convert the cookie string into the dict form expected by requests
cookie = parsecookie(cookie_str)
Finally, request the page again with the parsed cookie to obtain the real data:
# Request the URL again, this time sending the cookie, to get the real page
print(gethtml(Target_url, cookie)[0:500])
Here is the complete code:
# -*- coding: utf-8 -*-
import re

import PyV8
import requests
Target_url = "http://www.kuaidaili.com/proxylist/1/"
def gethtml(url, cookie=None):
    """Fetch ``url`` with browser-like headers and return the raw response body.

    Args:
        url: Address of the page to request.
        cookie: Optional dict of cookies to send with the request.

    Returns:
        The response body as returned by ``requests`` (``Response.content``).
    """
    # Header names and values must not carry surrounding spaces, otherwise
    # they are sent as malformed (or rejected as invalid) header fields.
    header = {
        "Host": "www.kuaidaili.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    html = requests.get(url=url, headers=header, timeout=30, cookies=cookie).content
    return html
def executejs(js_func_string, arg):
    """Evaluate a JS function definition with PyV8 and call it with ``arg``.

    Args:
        js_func_string: Source text of a single JavaScript function.
        arg: Argument passed to the evaluated function.

    Returns:
        Whatever the JavaScript function returns.
    """
    # NOTE(review): this executes JavaScript taken from a remote page; only
    # use it against sites whose script you are prepared to run locally.
    ctxt = PyV8.JSContext()
    ctxt.enter()
    try:
        # Wrapping the source in parentheses makes it a function expression,
        # so eval() yields a callable instead of merely declaring the function.
        func = ctxt.eval("({js})".format(js=js_func_string))
        return func(arg)
    finally:
        # Leave the context even if evaluation or the call raises.
        ctxt.leave()
def parsecookie(string):
    """Convert a ``document.cookie=...`` JS statement into a one-entry dict.

    Only the first ``name=value`` pair (the text before the first ``;``) is
    kept; attributes such as ``expires`` or ``path`` are discarded.

    Args:
        string: Text such as ``document.cookie='name=value; expires=...'``.

    Returns:
        A dict ``{name: value}`` suitable for the ``cookies`` argument of
        ``requests.get``.

    Raises:
        ValueError: If no ``name=value`` pair can be found.
    """
    # Drop the assignment prefix and any quote that opens the JS string literal.
    string = string.replace("document.cookie=", "").lstrip("'\" ")
    clearance = string.split(";")[0]
    # partition() splits on the first "=" only, so values containing "=" survive.
    name, sep, value = clearance.partition("=")
    name = name.strip()
    if not sep or not name:
        raise ValueError("no name=value pair found in cookie string: %r" % string)
    return {name: value.strip().strip("'\"")}
# First request: fetch the page to obtain the dynamically generated, obfuscated JS
first_html = gethtml(Target_url)

# Extract the obfuscated JS function (everything from "function" up to </script>)
js_func = "".join(re.findall(r"(function .*?)</script>", first_html))
print("Get JS func:\n%s" % js_func)

# Extract the numeric argument the page passes to that function via setTimeout("name(123)", ...)
js_arg = "".join(re.findall(r'setTimeout\(\"\D+\((\d+)\)\"', first_html))
print("Get JS arg:\n%s" % js_arg)

# Patch the JS function so it returns the cookie statement instead of eval-ing it
js_func = js_func.replace('eval("qo=eval;qo(po);")', "return po")

# Execute the patched JS to obtain the cookie string
cookie_str = executejs(js_func, js_arg)

# Convert the cookie string into the dict form expected by requests
cookie = parsecookie(cookie_str)
print(cookie)

# Request the URL again, this time sending the cookie, to get the real page
print(gethtml(Target_url, cookie)[0:500])
Using the PyV8 module to work around a website's JavaScript-encrypted cookies