Script memory and CPU footprint is very low!
When I answered the question my script was about 60% complete; I finished it yesterday morning while working overtime. It uses OCR to parse the verification code, configured to recognize digits only.
Before using the script you need to install several libraries. If you want OCR recognition you also need to install Tesseract-OCR on the system: on Linux a precompiled package should be available, and on a Mac a single `brew install tesseract` is enough.
requirements.txt
progressbar==2.3
pyquery==1.2.9
requests==2.4.3
Pillow==2.8.2
Optional:
pytesseract==0.1.6
Install the dependencies with a single pip command:
pip install requests progressbar pyquery Pillow pytesseract
Tested platforms
OS X, CentOS
Python 2.6, 2.7
How to use
yunfile_downloader -u xxx -p /tmp/download -adb
-u Download address
-a Auto-upload after the download (requires bypy)
-b Download in the background
-d Debug
-p Download path (defaults to the current folder)
Viewing the progress of a background download
tail -f /tmp/yunfile.log
#!/usr/bin/env python
# Encoding:utf-8
"""
@version: 0.3
@author: Endoffiht
@file: yunfile_downloader.py
@time: 15/6/29 18:06
"""
import cgi
import getopt
import httplib
import os
import re
import subprocess
import sys
import time
from urlparse import urlparse

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

import requests
from PIL import Image
from progressbar import *
from pyquery import PyQuery as pq
def yun_download(url, background=False, file_path=None, debug=False, auto=False):
    """Run the complete download flow for one file page.

    Steps, in order: optionally enable HTTP request dumping, create the
    shared session, fetch the landing page, obtain the verification code,
    wait 30 seconds, then download in the foreground or in the background.

    Args:
        url: Address of the file page to download from.
        background: When true, hand the download to ``background_download``;
            otherwise call ``download_page`` directly.
        file_path: Directory to save into; passed through unchanged.
        debug: When true, call ``patch_send`` so outgoing requests are printed.
        auto: Passed through to the download step (auto-upload switch).
    """
    if debug:
        # Print every outgoing HTTP request before it is sent.
        patch_send()
    print('Initiate requests.session')
    init()
    download_link, vcode_url = wait_page(url)
    print('Requseting for Vcode')
    vcode = get_vcode(vcode_url, download_link)
    print('Please wait 30s')
    # Drop the last five characters and append "/<code>.html".
    # NOTE(review): assumes the link ends in ".html" -- confirm.
    download_link = download_link[:-5] + '/' + vcode + '.html'
    print('Download_link with code-->%s' % download_link)
    wait(30)
    print('Begin download process')
    if background:
        background_download(download_link, file_path, auto)
    else:
        download_page(download_link, file_path, auto)
def wait_page(file_url):
    """First step: find the next-page link and the verification image link.

    Fetches ``file_url`` with the shared session ``s``, then builds two
    absolute URLs on the same scheme and host as the final (post-redirect)
    address.

    Args:
        file_url: Address of the file page.

    Returns:
        A ``(download_link, vcode_url)`` tuple of absolute URLs.

    Raises:
        ValueError: If the page has no ``#downpage_link`` element with an
            ``href`` attribute.
    """
    r = s.get(file_url)
    # Work with the address the request actually ended up at.
    file_url = r.url
    # The response is ignored; presumably the request is only needed for
    # its server-side effect (cookies) -- TODO confirm.
    s.get(file_url + '&dr=')
    d = pq(r.text)
    u = urlparse(file_url)
    href = d('#downpage_link').attr('href')
    if not href:
        raise ValueError('no #downpage_link href found on %s' % file_url)
    origin = u.scheme + '://' + u.netloc
    download_link = origin + href
    vcode_url = origin + '/verifyimg/getpcv.html'
    return download_link, vcode_url
def auto_upload(dir_name, cmd='/usr/local/bin/python /usr/bin/bypy.py upload'):
    """Run the upload command inside ``dir_name`` and exit the process.

    The command's standard output is printed once it finishes, and the
    process then exits with the command's return code.

    Args:
        dir_name: Directory to run the command in. A false value (``None``
            or ``''``) means the current working directory.
        cmd: Shell command line to execute.

    Raises:
        SystemExit: Always, carrying the command's return code.
        OSError: If ``dir_name`` cannot be used as a working directory.
    """
    # The directory is passed as ``cwd`` rather than spliced into a
    # "cd ... && ..." string, so its characters are never shell-interpreted.
    proc = subprocess.Popen(
        cmd,
        shell=True,
        cwd=dir_name or None,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    output = proc.communicate()[0]
    print(output)
    sys.exit(proc.returncode)
def wait(seconds, sleep=None):
    """Block for ``seconds`` seconds while printing a countdown.

    The remaining time is printed every fifth second and on each of the
    last four seconds.

    Args:
        seconds: Whole number of seconds to wait; zero or negative values
            return immediately.
        sleep: Callable invoked as ``sleep(1)`` once per second. Defaults
            to ``time.sleep``.
    """
    if sleep is None:
        sleep = time.sleep
    # One iteration per second, so the full duration is actually waited.
    for elapsed in range(seconds):
        remaining = seconds - elapsed
        if elapsed % 5 == 0 or remaining < 5:
            print(remaining)
        sleep(1)
def background_download(link, file_path, auto):
    """Detach from the terminal and run ``download_page`` in the background.

    Forks twice with ``setsid`` in between, redirects stdout and stderr to
    ``/tmp/yunfile.log`` (emptied first), reads stdin from the null device,
    then calls ``download_page``. The calling process exits at the first
    fork, so this function never returns to its caller.

    Args:
        link: Download link, passed to ``download_page``.
        file_path: Target directory, passed to ``download_page``.
        auto: Auto-upload switch, passed to ``download_page``.
    """
    try:
        if os.fork() > 0:
            # Parent of the first fork: done.
            sys.exit(0)
    except OSError as e:
        print('fork #1 failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)

    # Start a new session, then fork again.
    os.setsid()
    os.umask(0)
    try:
        if os.fork() > 0:
            sys.exit(0)
    except OSError as e:
        print('fork #2 failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)

    # Flush anything buffered before the descriptors are replaced.
    sys.stdout.flush()
    sys.stderr.flush()

    out_filename = '/tmp/yunfile.log'
    # Create the log file, or empty it if it already exists.
    open(out_filename, 'w').close()

    # Only the descriptors are used below, so Python-level buffering of
    # these file objects does not matter.
    si = open(os.devnull, 'r')
    so = open(out_filename, 'a+')
    os.dup2(si.fileno(), sys.stdin.fileno())
    os.dup2(so.fileno(), sys.stdout.fileno())
    os.dup2(so.fileno(), sys.stderr.fileno())

    download_page(link, file_path, auto)
    sys.exit(0)
def patch_send():
    """Wrap ``httplib.HTTPConnection.send`` so outgoing data is printed.

    After this call every ``send`` prints the data between a start and an
    end marker and then delegates to the original method. Calling the
    function again is a no-op, so the output is never duplicated.
    """
    old_send = httplib.HTTPConnection.send
    if getattr(old_send, '_yunfile_patched', False):
        return

    def new_send(self, data):
        print('-----start-----')
        print(data)
        print('-----End-----')
        # Returning is not required, but keeps working if the library's
        # send() ever starts returning a value.
        return old_send(self, data)

    new_send._yunfile_patched = True
    httplib.HTTPConnection.send = new_send
def init():
    """Create the module-wide ``requests`` session ``s`` and prime it.

    Installs a fixed set of browser-like request headers on the session and
    issues one GET to the site's home page.
    """
    origin_url = 'http://www.yunfile.com'
    # NOTE(review): the letter case inside these header values was
    # reconstructed from a case-mangled copy -- confirm against the upstream
    # script, the Referer token in particular.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/27.0.1453.93 Safari/537.36',
        'Referer': 'http://www.baidu.com/link?url=yRbMCjHoOmVlf-cn9ef'
                   'Zre0vhjkaymuutkdd2o24lyizp2mrsvv_vfdfs4uiprc7&wd='
                   '&eqid=cde1aa8f00001b72000000025587c4ff',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }
    global s
    s = requests.session()
    # Update in place so the session keeps its own header container.
    s.headers.update(headers)
    s.get(origin_url)
def download_page(download_link, file_path, auto):
    """Open the download page, submit its form and save the file.

    Uses the shared session ``s``. If the first request was redirected the
    function prints an error and exits with status 2. Otherwise it visits
    the counter URLs found in the page, posts the ``#d_down_from`` form and
    streams the response to disk behind a progress bar.

    Args:
        download_link: URL of the download page (already carrying the code).
        file_path: Directory to save into, created if missing. A false
            value saves into the current working directory.
        auto: When true, call ``auto_upload`` after the download; otherwise
            exit with status 0.
    """
    r = s.get(download_link)
    # Later requests must carry this page as their Referer.
    s.headers['Referer'] = download_link

    if r.history:
        # The request was redirected instead of landing on the form page.
        print('Error when downloading')
        print(s.headers)
        sys.exit(2)

    # These URLs have to be requested too; they may set a new cookie.
    for counter_url in re.findall(r"http://www.yunfile.com/ckcounter.jsp[^']*", r.text):
        s.get(counter_url)

    # Collect every named input of the download form.
    d = pq(r.text)
    action = d('#d_down_from').attr('action')
    data = {}
    for element in d('#d_down_from input'):
        field = pq(element)
        key = field.attr('name')
        if key:
            data[key] = field.attr('value')

    # Two further values are only present in the page's inline script.
    # NOTE(review): the "fileId" casing was reconstructed from a
    # case-mangled copy -- confirm against the live page.
    data['vid'] = re.search(r'var vericode = "(\w+)"', r.text).group(1)
    data['fileId'] = re.search(r'fileId\.value = "(\w+)";', r.text).group(1)

    # stream=True so the body is read chunk by chunk below.
    r = s.post(action, data, stream=True)
    try:
        value, params = cgi.parse_header(r.headers['content-disposition'])
        # The name comes from the server: keep only its final component so
        # it cannot point outside the target directory.
        file_name = os.path.basename(params['filename'])
        if file_path:
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            real_path = os.path.join(file_path, file_name)
        else:
            real_path = file_name
        print('Start downloading ' + real_path)
        print('')

        total = int(r.headers['content-length'])
        widgets = ['Downloading ' + file_name, Percentage(), ' ',
                   Bar(marker=RotatingMarker()), ' ', ETA(), ' ',
                   FileTransferSpeed()]
        pbar = ProgressBar(widgets=widgets, maxval=total).start()

        with open(real_path, 'wb') as fd:
            progress = 0
            for chunk in r.iter_content(1024):
                # Accumulate, so the bar shows total bytes written so far.
                progress += len(chunk)
                fd.write(chunk)
                pbar.update(progress)
        pbar.finish()
        print('Download complete')

        if auto:
            print('Start upload')
            auto_upload(file_path or '.')
        else:
            sys.exit(0)
    except Exception as e:
        # Best effort: report what went wrong plus the session headers.
        # SystemExit is not an Exception subclass, so exits pass through.
        print('Download failed: %s' % e)
        print(s.headers)
def get_vcode(vcode_url, refer):
    """Obtain the verification code, by OCR guess and/or user input.

    Each round downloads the verification image with the shared session
    ``s``, prints it as ASCII art and, if ``pytesseract`` works, makes a
    digits-only OCR guess. The user is then prompted:

    * typing ``n`` fetches a new image;
    * typing anything else returns that text as the code;
    * pressing Enter accepts the OCR guess when it has four characters,
      and otherwise fetches a new image.

    Args:
        vcode_url: URL of the verification image.
        refer: URL to send as the ``Referer`` header.

    Returns:
        The verification code as a string.
    """
    # The image request must carry the download page as its Referer.
    s.headers['Referer'] = refer
    while True:
        r = s.get(vcode_url)
        m_image = Image.open(StringIO(r.content))
        image_to_ascii(m_image)

        # OCR is optional: if pytesseract is missing or fails, fall back to
        # asking the user.
        try:
            import pytesseract
            recognized = pytesseract.image_to_string(m_image, config='digits')
            # Keep digits only.
            guess_code = re.sub(r'\D', '', recognized)
        except Exception:
            guess_code = None

        has_guess = bool(guess_code) and len(guess_code) == 4
        if has_guess:
            ask = 'Vcode == %s? Press ENTER to Confirm,n for refresh or tell me------>' % guess_code
        else:
            ask = 'Please tell me the code------>'

        code = raw_input(ask).strip()
        if code.lower() == 'n':
            continue
        if code:
            return code
        if has_guess:
            return guess_code
        # Enter with no usable guess: loop and show a new image.
def image_to_ascii(image, new_width=100):
    """Print ``image`` to standard output as ASCII art.

    The image is resized to ``new_width`` columns, keeping the width/height
    ratio of the pixel grid, converted to greyscale with ``convert("L")``,
    and each pixel is mapped to one character.
    Based on http://a-eter.blogspot.com/2010/04/image-to-ascii-art-in-python.html

    Args:
        image: Object with a ``size`` attribute and ``resize``, ``convert``
            and ``getdata`` methods, such as a PIL image.
        new_width: Number of characters per output row.
    """
    # Eleven characters, indexed by pixel_value // 25.
    # Assumes pixel values are 0-255 after convert("L") -- TODO confirm.
    ascii_chars = ['#', 'A', '@', '%', 'S', '+', '<', '*', ':', ',', '.']

    width, height = image.size
    # Always keep at least one row.
    new_height = max(1, height * new_width // width)
    grey = image.resize((new_width, new_height)).convert('L')

    text = ''.join(ascii_chars[pixel // 25] for pixel in grey.getdata())
    for start in range(0, len(text), new_width):
        print(text[start:start + new_width])
def login():
    """User login -- not implemented yet; currently does nothing."""
    pass
def main(argv=None):
    """Parse the command line and start the download.

    Recognized options: ``-u URL`` (required), ``-p PATH``, ``-d`` (debug),
    ``-b`` (background) and ``-a`` (auto-upload). On an unknown option or a
    missing ``-u`` the help text is printed and the process exits with
    status 2.

    Args:
        argv: Full argument vector including the program name at index 0.
            Defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    short_opts = 'u:p:dba'
    try:
        optlist, args = getopt.getopt(argv[1:], short_opts)
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    config = {
        'debug': False,
        'background': False,
        'path': None,
        'auto_upload': False,
    }
    for opt, value in optlist:
        if opt == '-u':
            config['url'] = value
        elif opt == '-d':
            config['debug'] = True
        elif opt == '-b':
            config['background'] = True
        elif opt == '-p':
            config['path'] = value
        elif opt == '-a':
            config['auto_upload'] = True

    if 'url' not in config:
        print_help()
        sys.exit(2)

    yun_download(config['url'], config['background'], config['path'],
                 config['debug'], config['auto_upload'])
def print_help():
    """Print the command-line usage summary to standard output."""
    print('Usage: yunfile_downloader.py -u URL [-p PATH] [-a] [-b] [-d]\n'
          '  -u URL   download address (required)\n'
          '  -p PATH  download path (default: current folder)\n'
          '  -a       auto-upload after the download (requires bypy)\n'
          '  -b       download in the background\n'
          '  -d       debug: print outgoing HTTP requests')
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()