YunFile file download script for Linux systems

Source: Internet
Author: User
Tags flush

The script's memory and CPU footprint is very low!


When I answered the question my script was about 60% complete; I finished it yesterday morning after working overtime. It uses OCR to parse the verification code, restricted to recognizing digits only!

Several libraries must be installed before use. If you want OCR recognition you also need to install tesseract-ocr on the system: on Linux you should be able to find a precompiled package, and on the author's Mac a single `brew install tesseract` is enough.

requirements.txt

progressbar==2.3
pyquery==1.2.9
requests==2.4.3
Pillow==2.8.2
Optional:

pytesseract==0.1.6
Install all dependencies with a single pip command

pip install requests progressbar pyquery Pillow pytesseract
Test Platform

OS X CentOS
Python 2.6 2.7

How to use

yunfile_downloader -u xxx -p /tmp/download -adb

-u Download address
-a Auto-upload after download (requires bypy)
-b Background download
-d Debug
-p Download path (default: current folder)

Viewing the progress of a background download

tail -f /tmp/yunfile.log
#!/usr/bin/env python
# Encoding:utf-8


"""
@version: 0.3
@author: Endoffiht
@file: yunfile_downloader.py
@time: 15/6/29 18:06
"""

import cgi
import getopt
import os
import re
import subprocess
import sys
import time

try:
    import httplib
except ImportError:  # Python 3 renamed the module
    import http.client as httplib

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:  # Python 3: response bodies are bytes
        from io import BytesIO as StringIO

try:
    from urlparse import urlparse
except ImportError:  # Python 3 moved it into urllib.parse
    from urllib.parse import urlparse

import requests
from PIL import Image
from progressbar import *
from pyquery import PyQuery as PQ


def yun_download(url, background=False, file_path=None, debug=False, auto=False):
    """Run the complete download flow for one share URL.

    The steps are: optionally enable request dumping, create the HTTP
    session, resolve the download page and captcha URLs, ask for the
    verification code, wait, then download in the foreground or background.

    :param url: address of the file page to download from.
    :param background: when true, hand over to ``background_download``.
    :param file_path: target directory, or ``None`` for the current one.
    :param debug: when true, call ``patch_send`` to dump outgoing requests.
    :param auto: forwarded to the download step (upload after download).
    """
    if debug:
        # Display header: print every request that is sent.
        patch_send()
    print('Initiate requests.session')
    init()
    download_link, vcode_url = wait_page(url)
    print('Requseting for vcode')
    vcode = get_vcode(vcode_url, download_link)
    if not vcode:
        # Without a code the link below cannot be built (would be a TypeError).
        print('No verification code given')
        sys.exit(2)

    print('Please wait 30s')

    # Drop the last five characters and append "/<code>.html".
    # NOTE(review): assumes the link ends in ".html" - confirm.
    download_link = download_link[:-5] + '/' + vcode + '.html'
    print('Download_link with code-->%s' % download_link)

    wait(30)

    print('Begin download process')

    if background:
        background_download(download_link, file_path, auto)
    else:
        download_page(download_link, file_path, auto)


# first step, get to the next page link and verify code picture link
def wait_page(file_url):
    """Fetch the file page and derive the next-step URLs.

    :param file_url: address of the file page.
    :returns: ``(download_link, vcode_url)``; both are built from the scheme
        and host of the final (post-redirect) page URL.
    :raises ValueError: if the page has no ``#downpage_link`` href.
    """
    r = s.get(file_url)
    file_url = r.url  # continue with the URL we were redirected to
    # The response is discarded.
    # NOTE(review): presumably requested for its cookies - confirm.
    s.get(file_url + '&dr=')
    d = PQ(r.text)
    u = urlparse(file_url)
    href = d('#downpage_link').attr("href")
    if not href:
        # Fail with a clear message instead of a TypeError from join().
        raise ValueError('download link not found on page %s' % file_url)
    base = u.scheme + '://' + u.netloc
    download_link = base + href
    vcode_url = base + '/verifyimg/getpcv.html'
    return download_link, vcode_url


def auto_upload(dir_name, cmd='/usr/local/bin/python /usr/bin/bypy.py upload'):
    """Run the upload command inside ``dir_name`` and exit the process.

    The directory is passed as the child's working directory instead of
    being pasted into a ``cd ... &&`` shell string, so paths containing
    spaces or shell metacharacters work; ``None`` or an empty value means
    the current directory.

    :param dir_name: directory to run the command in.
    :param cmd: shell command line to execute.
    :raises SystemExit: always; the exit status is the command's return
        code, or 1 when the command could not be started.
    """
    try:
        proc = subprocess.Popen(cmd, shell=True, cwd=dir_name or None,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError as e:
        # e.g. dir_name does not exist
        print('upload failed: %s' % e)
        sys.exit(1)
    output = proc.communicate()[0]
    print(output)
    sys.exit(proc.returncode)


def wait(seconds):
    """Sleep for ``seconds`` seconds while printing a countdown.

    The remaining time is printed every fifth second and on each of the
    last four seconds. A value of zero or less returns immediately.

    :param seconds: whole number of seconds to wait.
    """
    # range(seconds), not range(seconds - 1): sleep the full duration.
    for elapsed in range(seconds):
        if elapsed % 5 == 0 or elapsed > seconds - 5:
            print(seconds - elapsed)
        time.sleep(1)


def background_download(link, file_path, auto):
    """Detach from the terminal with a double fork, then download.

    Both parent processes exit with status 0; the surviving child points
    stdin, stdout and stderr at ``/tmp/yunfile.log`` (emptied first), runs
    ``download_page`` and exits.

    :param link: download page URL, passed to ``download_page``.
    :param file_path: target directory, passed to ``download_page``.
    :param auto: upload flag, passed to ``download_page``.
    """
    try:
        if os.fork() > 0:
            sys.exit(0)  # first parent returns control to the shell
    except OSError as e:
        print('fork #1 failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)

    # Become leader of a new session and clear the inherited umask.
    os.setsid()
    os.umask(0)

    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0)  # second parent exits, leaving only the grandchild
    except OSError as e:
        print('fork #2 failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)

    sys.stdout.flush()
    sys.stderr.flush()
    out_filename = "/tmp/yunfile.log"

    # Create the log or empty an existing one without spawning a shell.
    with open(out_filename, 'w'):
        pass
    si = open(out_filename, 'r')
    so = open(out_filename, 'a+')
    # No explicit buffering argument: only the descriptor is used below,
    # and unbuffered text mode is rejected by Python 3.
    se = open(out_filename, 'a+')
    os.dup2(si.fileno(), sys.stdin.fileno())
    os.dup2(so.fileno(), sys.stdout.fileno())
    os.dup2(se.fileno(), sys.stderr.fileno())

    download_page(link, file_path, auto)
    sys.exit(0)


def patch_send():
    """Replace ``HTTPConnection.send`` with a wrapper that prints the data.

    Everything handed to ``send`` is printed between start/end markers
    before being passed to the original method. Calling this function more
    than once installs the wrapper only once, so output is not duplicated.
    """
    old_send = httplib.HTTPConnection.send
    if getattr(old_send, '_yunfile_debug_patch', False):
        return  # already patched; do not stack wrappers

    def new_send(self, header):
        print('-----start-----')
        print(header)
        print('-----End-----')
        # return isn't necessary, but never hurts, in case the library is changed
        return old_send(self, header)

    new_send._yunfile_debug_patch = True
    httplib.HTTPConnection.send = new_send




def init():
    """Create the module-wide session ``s`` and open the site's home page.

    The session gets browser-like request headers and then performs one GET
    on the home page.
    NOTE(review): that GET is presumably there to collect initial cookies -
    confirm.
    """
    global s

    origin_url = 'http://www.yunfile.com'

    # NOTE(review): the letter case inside the Referer value was
    # reconstructed from a damaged copy - confirm against the upstream file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/27.0.1453.93 Safari/537.36',
        'Referer': 'http://www.baidu.com/link?url=yRbMCjHoOmVlf-cn9ef'
                   'Zre0vhjkaymuutkdd2o24lyizp2mrsvv_vfdfs4uiprc7&wd='
                   '&eqid=cde1aa8f00001b72000000025587c4ff',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }

    s = requests.session()
    # update() keeps the session's case-insensitive header mapping instead
    # of replacing it with a plain dict.
    s.headers.update(headers)
    s.get(origin_url)


# Go to download page
def download_page(download_link, file_path, auto):
    """Submit the download form and stream the file to disk.

    Exits with status 2 when the download page request was redirected or
    when saving the file fails. After a successful download it either calls
    ``auto_upload`` (when ``auto`` is true) or exits with status 0.

    :param download_link: URL of the download page, including the code.
    :param file_path: target directory (created if missing), or a false
        value to save into the current directory.
    :param auto: when true, run ``auto_upload`` on the target directory.
    """
    r = s.get(download_link)
    # need to set Referer
    s.headers['Referer'] = download_link

    if r.history:
        # The request was redirected instead of returning the download page.
        print('Error when downloading')
        print(s.headers)
        sys.exit(2)

    # need to access these URLs and may get a new cookie
    for url in re.findall(r'http://www\.yunfile\.com/ckcounter\.jsp[^\'"]*', r.text):
        s.get(url)

    # Collect the named inputs of the download form.
    d = PQ(r.text)
    action = d('#d_down_from').attr('action')
    data = {}
    for field in d('#d_down_from input'):
        field = PQ(field)
        key = field.attr('name')
        if key:
            data[key] = field.attr('value')

    # The following two values are hidden in the page's JS.
    # NOTE(review): the "fileId" spelling was reconstructed from a damaged
    # copy - confirm against the live page.
    data['vid'] = re.search(r'var vericode = "(\w+)"', r.text).group(1)
    data['fileId'] = re.search(r'fileId\.value = "(\w+)";', r.text).group(1)

    # Post to get the file; the body must be streamed.
    r = s.post(action, data, stream=True)

    try:
        value, params = cgi.parse_header(r.headers['content-disposition'])
        # The name comes from the server: keep only its last component so
        # it cannot point outside the target directory.
        file_name = os.path.basename(params['filename'])

        if file_path:
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            real_path = os.path.join(file_path, file_name)
        else:
            real_path = file_name

        print('Start downloading ' + real_path)
        print('')

        # Initialize progress bar
        total = int(r.headers['content-length'])
        widgets = ['Downloading ' + file_name, Percentage(), ' ',
                   Bar(marker=RotatingMarker()),
                   ' ', ETA(), ' ', FileTransferSpeed()]
        pbar = ProgressBar(widgets=widgets, maxval=total).start()

        # Write file
        with open(real_path, 'wb') as fd:
            progress = 0
            for chunk in r.iter_content(1024):
                progress += len(chunk)  # running total, not the chunk size
                fd.write(chunk)
                # Never report more than the announced length.
                pbar.update(min(progress, total))
        pbar.finish()
        print('Download complete')
    except Exception as e:
        # Report the failure instead of silently ending with status 0.
        print('Error when downloading: %s' % e)
        print(s.headers)
        sys.exit(2)

    if auto:
        print('Start upload')
        # No directory given means the file went to the current one.
        auto_upload(file_path or '.')
    else:
        sys.exit(0)


# get Authenticode
def get_vcode(vcode_url, refer):
    """Ask the user for the verification code, with an optional OCR guess.

    Each round downloads the captcha, prints it as ASCII art, and (when
    ``pytesseract`` is importable) suggests the digits it recognized.
    Typing a code returns it; ``n`` fetches a new image; Enter accepts a
    4-digit suggestion. Enter without such a suggestion fetches a new image
    rather than returning an unusable value.

    :param vcode_url: URL of the captcha image.
    :param refer: value stored as the session's ``Referer`` header.
    :returns: the verification code as a string.
    """
    # need to set Referer
    s.headers['Referer'] = refer

    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input

    # If pytesseract is not installed the OCR step is skipped.
    try:
        import pytesseract
    except ImportError:
        pytesseract = None

    while True:
        r = s.get(vcode_url)
        m_image = Image.open(StringIO(r.content))
        image_to_ascii(m_image)

        guess_code = None
        if pytesseract is not None:
            try:
                text = pytesseract.image_to_string(m_image, config='digits')
                # NOTE(review): "\D" (strip non-digits) was reconstructed
                # from a damaged copy - confirm against the upstream file.
                guess_code = re.sub(r'\D', '', text)
            except Exception:
                guess_code = None  # OCR is best-effort only

        guess_usable = bool(guess_code) and len(guess_code) == 4

        # Enter confirms, n refreshes the verification code.
        if guess_usable:
            ask = "vcode == %s? Press Enter to confirm, n for refresh or tell me------>" % guess_code
        else:
            ask = 'Please tell me the code------>'

        code = read_line(ask).strip()

        if code.lower() == 'n':
            continue
        if code:
            return code
        if guess_usable:
            return guess_code
        # Enter with nothing usable to confirm: fetch a new image.


# Picture turn to ASCII, http://a-eter.blogspot.com/2010/04/image-to-ascii-art-in-python.html
def image_to_ascii(image):
    """Print ``image`` as ASCII art, 100 characters wide.

    The image is resized to width 100 (height scaled by the same factor, at
    least 1), converted to mode "L", and each pixel value is mapped to one
    of eleven glyphs by integer-dividing it by 25.

    :param image: object providing ``size``, ``resize`` and ``convert`` as
        used below (a PIL image in this script).
    """
    # One glyph per bucket 0..10 (255 // 25 == 10).
    # NOTE(review): glyph list reconstructed from a damaged copy - confirm.
    ascii_chars = ['#', 'A', '@', '%', 'S', '+', '<', '*', ':', ',', '.']

    width, height = image.size
    new_width = 100
    # Floor division keeps this an int on Python 3; never shrink to 0 rows.
    new_height = max(1, height * new_width // width)

    gray = image.resize((new_width, new_height)).convert("L")  # grayscale
    text = ''.join(ascii_chars[pixel // 25] for pixel in gray.getdata())

    for start in range(0, len(text), new_width):
        print(text[start:start + new_width])


# User Login, pending development
def login():
    """Placeholder for user login; does nothing yet."""
    pass


def main(argv=None):
    """Parse the command-line options and start the download.

    Options: ``-u URL`` (required), ``-p PATH``, ``-d`` (debug), ``-b``
    (background), ``-a`` (auto upload). Prints the help and exits with
    status 2 on an unknown option or when ``-u`` is missing.

    :param argv: option list without the program name; defaults to
        ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    short_opts = 'u:p:dba'
    try:
        optlist, args = getopt.getopt(argv, short_opts)
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    config = {
        'debug': False,
        'background': False,
        'path': None,
        'auto_upload': False,
    }

    # getopt reports short options with their leading hyphen.
    for k, v in optlist:
        if k == '-u':
            config['url'] = v
        elif k == '-d':
            config['debug'] = True
        elif k == '-b':
            config['background'] = True
        elif k == '-p':
            config['path'] = v
        elif k == '-a':
            config['auto_upload'] = True

    if 'url' not in config:
        print_help()
        sys.exit(2)

    yun_download(config['url'], config['background'], config['path'],
                 config['debug'], config['auto_upload'])


def print_help():
    """Print the command-line usage summary to standard output."""
    print('\n'.join((
        'Help',
        'Usage: yunfile_downloader -u URL [-p PATH] [-d] [-b] [-a]',
        '  -u URL   download address (required)',
        '  -p PATH  download path (default: current folder)',
        '  -d       debug',
        '  -b       background download',
        '  -a       auto upload after download',
    )))


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of the page confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.