A summary of how Python implements multithreaded download files

Source: Internet
Author: User
Tags flush ranges

Today I took another look at the multithreaded file downloading I studied when I was first learning Python.

To achieve a simple multithreaded download, you need to focus on the following points:
1. File size: it can be extracted from the response header, e.g. "Content-Length: 911" indicates that the size is 911 bytes.
2. Task split: specify which piece of the file each thread downloads by adding "Range: bytes=300-400" to the request header (which means downloading bytes 300-400 of the content); note that the valid byte range of a file is [0, size-1].
3. Aggregation of downloaded files: Each thread saves its own downloaded block of files as a temporary file, all threads are completed, and then the temporary files are aggregated into the final file in order.

On the internet I found a good multithreaded download-file example, written following the idea of the Linux multithreaded download tool Axel; the source code is at: http://fayaa.com/code/view/58/full/
I've included it below; it can be modified to fit further needs. The sample code is as follows:
https://github.com/smilejay/python/blob/master/py2014/paxel.py

The code is as follows Copy Code
#!/usr/bin/python
# -*- coding: utf-8 -*-

# filename: paxel.py
# from: http://fayaa.com/code/view/58/full/
# Jay modified it a little and saved it for further potential usage.

'''It is a multi-thread downloading tool.

It was developed following Axel.

Author: volans
E-mail: volansw [at] gmail.com
'''

import os
import sys
import time
import urllib.request
from threading import Thread

# In case you want to use an HTTP proxy, pass this as the *proxies*
# argument of paxel(); the default address below is only an example.
local_proxies = {'http': 'http://131.139.58.200:8080'}








class Axelpython(Thread, urllib.request.FancyURLopener):
    '''Multi-thread downloading worker.

    Each instance downloads one inclusive byte range of *url* into its
    own temporary part file, resuming from whatever is already on disk.

    run() is a virtual method of Thread.
    '''

    def __init__(self, threadname, url, filename, ranges=0, proxies=None):
        '''
        threadname -- name for the worker thread.
        url        -- source URL to download from.
        filename   -- part file this worker appends to.
        ranges     -- (start, end) inclusive byte offsets owned by this worker.
        proxies    -- optional proxy mapping forwarded to FancyURLopener.
        '''
        Thread.__init__(self, name=threadname)
        # None (not a shared mutable default) lets FancyURLopener pick up
        # environment proxies; an explicit dict overrides them.
        urllib.request.FancyURLopener.__init__(self, proxies)
        self.name = threadname
        self.url = url
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0  # bytes fetched so far (includes resumed bytes)

    def run(self):
        '''Virtual function in Thread: fetch the assigned byte range.'''
        try:
            # Resume support: count bytes already present in the part file.
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0

        # Rebuild the start point from what was already downloaded.
        self.startpoint = self.ranges[0] + self.downloaded

        # This part is already complete.
        if self.startpoint >= self.ranges[1]:
            print('Part %s has been downloaded over.' % self.filename)
            return

        self.oneTimeSize = 16384  # 16 KiB per read
        print('task %s download from %d to %d'
              % (self.name, self.startpoint, self.ranges[1]))

        self.addheader('Range', 'bytes=%d-%d'
                       % (self.startpoint, self.ranges[1]))
        self.urlhandle = self.open(self.url)

        data = self.urlhandle.read(self.oneTimeSize)
        while data:
            # Re-open in append mode each chunk, as the original did;
            # 'with' guarantees the handle is closed even on error.
            with open(self.filename, 'ab+') as filehandle:
                filehandle.write(data)
            # Accumulate (the original assigned, losing the running total).
            self.downloaded += len(data)
            data = self.urlhandle.read(self.oneTimeSize)








def geturlfilesize(url, proxies=None):
    '''Return the size in bytes of the file at *url*.

    The size is read from the Content-Length response header
    (matched case-insensitively on 'length'); 0 is returned when
    no such header is present.

    proxies -- optional proxy mapping, e.g. {'http': 'http://host:port'}.
    '''
    if proxies:
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler(proxies))
        urlhandler = opener.open(url)
    else:
        urlhandler = urllib.request.urlopen(url)

    length = 0
    for header, value in urlhandler.info().items():
        if header.lower().find('length') != -1:
            length = int(value.strip())
    return length








def spliteblocks(totalsize, blocknumber):
    '''Split *totalsize* bytes into *blocknumber* contiguous ranges.

    Returns a list of (start, end) tuples of inclusive byte offsets;
    the last block absorbs any remainder so the ranges exactly cover
    [0, totalsize - 1].
    '''
    blocksize = totalsize // blocknumber  # floor division: Py2 '/' semantics
    ranges = []
    for i in range(0, blocknumber - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    # Final block runs to the last byte, covering the remainder.
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges








def islive(tasks):
    '''Return True while at least one task thread is still running.

    Uses Thread.is_alive(); the original Py2 isAlive() alias was
    removed in Python 3.9.
    '''
    return any(task.is_alive() for task in tasks)








def paxel(url, output, blocks=6, proxies=local_proxies):
    '''paxel: download *url* into *output* using *blocks* parallel threads.

    Each thread writes its byte range to a temporary part file
    (tmpfile_<i>); when all threads finish, the parts are concatenated
    into *output* in order and removed.
    '''
    size = geturlfilesize(url, proxies)
    ranges = spliteblocks(size, blocks)

    threadname = ['thread_%d' % i for i in range(0, blocks)]
    filename = ['tmpfile_%d' % i for i in range(0, blocks)]

    tasks = []
    for i in range(0, blocks):
        task = Axelpython(threadname[i], url, filename[i], ranges[i])
        task.daemon = True  # setDaemon() is deprecated since 3.10
        task.start()
        tasks.append(task)

    # Give the workers a moment to start before polling progress.
    time.sleep(2)
    while islive(tasks):
        downloaded = sum(task.downloaded for task in tasks)
        process = downloaded / float(size) * 100
        # '\r' rewinds to line start so the progress line updates in place.
        show = '\rFilesize:%d Downloaded:%d Completed:%.2f%%' % (
            size, downloaded, process)
        sys.stdout.write(show)
        sys.stdout.flush()
        time.sleep(0.5)

    # Aggregate the part files into the final output, in order.
    with open(output, 'wb+') as filehandle:
        for part in filename:
            with open(part, 'rb') as f:
                filehandle.write(f.read())
            try:
                os.remove(part)
            except OSError:
                # Best-effort cleanup; a missing part file is not fatal here.
                pass





if __name__ == '__main__':
    # Demo: download a public file with 4 threads and no proxy.
    url = 'http://dldir1.qq.com/qqfile/QQforMac/QQ_V3.1.1.dmg'
    output = 'download.file'
    paxel(url, output, blocks=4, proxies={})




Adding HTTPS username/password authentication:

Multi-threaded downloading is worth considering when you need to download SSL-authenticated files over HTTPS, where a single-process download often hangs.

The code is as follows Copy Code
# -*- coding: utf8 -*-

import os
import sys
import time
import getpass
import threading
import urllib.request
import http.cookiejar

# NOTE(review): fill in your own target URL and credentials before use.
url = 'https://your.url'
username = 'your username'
password = 'your password'





#class which supply request Authcation info


class Terminalpassword(urllib.request.HTTPPasswordMgr):
    '''Password manager that falls back to module-level credentials.

    If no stored credentials match (realm, authuri), the module-level
    *username*/*password* pair is returned instead of (None, None).
    '''

    def find_user_password(self, realm, authuri):
        retval = urllib.request.HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        # Identity check against None, per PEP 8; the original compared ==.
        if retval[0] is None and retval[1] is None:
            return (username, password)
        else:
            return retval


'''It is a multi-thread downloading tool.

It was developed following Axel.

Author: volans
E-mail: volansw [at] gmail.com

Modify: Gavin MA
Date: 2011-04-12
'''


def Init():
    '''Install a cookie- and basic-auth-aware global opener and probe *url*.

    Returns True when the module-level *url* is reachable with the
    configured credentials; any failure is swallowed and None is
    returned (callers treat that as falsy).
    '''
    try:
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj),
            urllib.request.HTTPBasicAuthHandler(Terminalpassword()))
        urllib.request.install_opener(opener)
        urllib.request.urlopen(url)
        return True
    except Exception:
        # Deliberate best-effort: a failed probe simply yields a falsy result.
        pass


class Axelpython(threading.Thread):
    '''Multi-thread downloading worker (HTTPS basic-auth variant).

    run() is a virtual method of Thread. *flag* records whether this
    worker's byte range finished cleanly.
    '''

    def __init__(self, threadname, url, virus, filename, ranges=0):
        '''
        threadname -- worker thread name.
        url        -- base URL; *virus* (the sample path) is appended to it.
        virus      -- path suffix of the file to fetch.
        filename   -- part file this worker appends to.
        ranges     -- (start, end) inclusive byte offsets for this worker.
        '''
        threading.Thread.__init__(self, name=threadname)
        self.name = threadname
        self.url = url
        self.virus = virus
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0   # bytes fetched so far
        self.flag = False     # True once this range downloaded completely

    def run(self):
        '''Virtual function in Thread: fetch the assigned byte range.'''
        try:
            # Resume support: count bytes already in the part file.
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0

        # Rebuild the start point.
        self.startpoint = self.ranges[0] + self.downloaded

        # This part is already complete.
        if self.startpoint >= self.ranges[1]:
            print('Part %s has been downloaded over.' % self.filename)
            return

        self.oneTimeSize = 8000  # bytes per read
        try:
            Init()
            req = urllib.request.Request(self.url + self.virus)
            req.add_header('Range', 'bytes=%d-%d'
                           % (self.startpoint, self.ranges[1]))
            urlhandle = urllib.request.urlopen(req)

            data = urlhandle.read(self.oneTimeSize)
            while data:
                with open(self.filename, 'ab+') as filehandle:
                    filehandle.write(data)
                # Accumulate (the original assigned, losing the total).
                self.downloaded += len(data)
                data = urlhandle.read(self.oneTimeSize)
            self.flag = True
        except Exception:
            # The supervisor loop in paxel() re-runs failed workers.
            self.flag = False








def geturlfilesize(url, samplevirus):
    '''Return the Content-Length of url+samplevirus, or 0.

    0 is also returned when Init() fails (the original left *length*
    unbound in that branch, raising NameError).
    '''
    length = 0
    if Init():
        req = urllib.request.Request(url + samplevirus)
        urlhandler = urllib.request.urlopen(req)
        for header, value in urlhandler.info().items():
            if header.lower().find('content-length') != -1:
                length = int(value.strip())
    return length


def spliteblocks(totalsize, blocknumber):
    '''Split *totalsize* bytes into *blocknumber* contiguous ranges.

    Returns (start, end) tuples of inclusive byte offsets; the final
    block absorbs the remainder so the ranges cover [0, totalsize - 1].
    '''
    blocksize = totalsize // blocknumber  # floor division: Py2 '/' semantics
    ranges = []
    for i in range(0, blocknumber - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges


def islive(tasks):
    '''Return True while at least one task thread is still running.'''
    return any(task.is_alive() for task in tasks)


def paxel(url, samplevirus, output, blocks=6):
    '''paxel: download url+samplevirus to *output* with *blocks* threads.

    A supervisor loop re-runs any worker whose thread died before
    setting its *flag*; once all *blocks* workers flag success, the
    part files are concatenated into *output* and removed.
    '''
    size = geturlfilesize(url, samplevirus)
    ran = spliteblocks(size, blocks)

    threadname = ['thread_%d' % i for i in range(0, blocks)]
    # NOTE(review): the original referenced a global *path* that is not
    # defined anywhere in this file; fall back to the current directory.
    base = globals().get('path', os.curdir)
    filename = [base + os.sep + 'tmpfile_%d' % i for i in range(0, blocks)]

    tasks = []
    for i in range(0, blocks):
        task = Axelpython(threadname[i], url, samplevirus, filename[i], ran[i])
        task.daemon = True  # setDaemon() is deprecated since 3.10
        task.start()
        tasks.append(task)

    global finish, count
    finish = True
    count = 0
    while finish:
        # Iterate over a copy: tasks are removed inside the loop.
        for task in list(tasks):
            if not task.is_alive():
                # Dead but unflagged: retry this range synchronously.
                task.run()
            time.sleep(0.5)
            if task.flag:
                count += 1
                tasks.remove(task)
        if count == blocks:
            finish = False

    time.sleep(2)
    # Aggregate the part files into the final output, in order.
    with open(output, 'wb+') as filehandle:
        for part in filename:
            with open(part, 'rb') as f:
                filehandle.write(f.read())
            try:
                os.remove(part)
            except OSError:
                # Best-effort cleanup; missing part files are not fatal here.
                pass








def main():
    '''Entry point.

    NOTE(review): the original was pseudo-code —
    ``Paxel(url, download file, output directory, blocks=6)`` —
    replace the placeholder arguments with a real sample path and
    output filename before running.
    '''
    paxel(url, '/path/to/download.file', 'output.file', blocks=6)


if __name__ == '__main__':
    main()




Displaying the average download speed:

The urlretrieve function of the urllib module can download a file when you know its URL address:
urllib.urlretrieve(url, filename)
filename is the local file name to save to. The function takes 2 further optional parameters; see the help documentation.

Multi-threaded download: each thread specifies which piece of the file to fetch from the server. A Range can be specified in the request header under the HTTP protocol. The urllib2 module is used below.

request = urllib2.Request(url)
request.add_header("Range", "bytes=%d-%d" % (1024, 2048))  # specify the byte range to download

opener = urllib2.build_opener()
data = opener.open(request).read()

Now data holds bytes 1024 through 2048 of the file.

Example:

The code is as follows Copy Code
#!/usr/bin/env python
# coding=utf-8

import re
import os
import sys
import time
import glob
import string
import socket
import getopt
import threading
import urllib.request

# NOTE(review): the original imported SGMLParser from sgmllib, which was
# removed in Python 3 and is unused in this script, so it is dropped.





#############################################################################


#


# self-defined Exception Classes


#


#############################################################################


#############################################################################
# Self-defined exception classes
#############################################################################
class Connectionerror(Exception):
    '''Raised when an HTTP connection cannot be established.'''
    pass


class Urlunreachable(Exception):
    '''Raised when the target URL cannot be reached (e.g. size is 0).'''
    pass


class Canotdownload(Exception):
    '''Raised when the download cannot proceed.'''
    pass





#############################################################################


#


# Multiple threads Download module starts here


#


#############################################################################


class Httpgetthread(threading.Thread):
    '''Worker thread: download one inclusive byte range of a URL to a part file.

    Tracks *downloaded* bytes, completion *percent*, and retries a failed
    connection up to 10 times, resuming from the bytes already on disk.
    '''

    def __init__(self, name, url, filename, range=0):
        '''
        name     -- thread name (the caller passes an int index; Thread
                    accepts it).
        url      -- source URL.
        filename -- part file this worker appends to.
        range    -- (start, end) inclusive byte offsets for this worker.
        '''
        threading.Thread.__init__(self, name=name)
        self.url = url
        self.filename = filename
        self.range = range
        # Inclusive range -> total number of bytes this thread owns.
        self.totallength = range[1] - range[0] + 1
        try:
            # Resume support: count bytes already in the part file.
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        self.percent = self.downloaded / float(self.totallength) * 100
        self.headerrange = (self.range[0] + self.downloaded, self.range[1])
        self.buffersize = 8192

    def run(self):
        '''Fetch the assigned range, retrying up to 10 times on error.'''
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        self.percent = self.downloaded / float(self.totallength) * 100
        self.buffersize = 8192

        downloadall = False
        retries = 1
        while not downloadall:
            if retries > 10:
                break
            try:
                # Re-issue the request from the current resume offset.
                self.headerrange = (self.range[0] + self.downloaded,
                                    self.range[1])
                request = urllib.request.Request(self.url)
                request.add_header('Range',
                                   'bytes=%d-%d' % self.headerrange)
                conn = urllib.request.urlopen(request)
                starttime = time.time()
                data = conn.read(self.buffersize)
                while data:
                    with open(self.filename, 'ab') as f:
                        f.write(data)
                    self.time = int(time.time() - starttime)
                    # Accumulate (original assigned, losing the total).
                    self.downloaded += len(data)
                    self.percent = (self.downloaded
                                    / float(self.totallength) * 100)
                    data = conn.read(self.buffersize)
                downloadall = True
            except Exception:
                # The original's 'Retries + 1' was a no-op; count properly.
                retries += 1
                time.sleep(1)
                continue





def Split(size, blocks):
    '''Split *size* bytes into *blocks* contiguous (start, end) ranges.

    Offsets are inclusive; the last block absorbs the remainder so the
    ranges exactly cover [0, size - 1].
    '''
    ranges = []
    blocksize = size // blocks  # floor division: Py2 '/' semantics
    for i in range(blocks - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    ranges.append((blocksize * (blocks - 1), size - 1))
    return ranges





def gethttpfilesize(url):
    '''Return the remote file size read from the Content-Length header.

    Returns 0 when the header is absent or any error occurs (callers
    such as myhttpget() treat 0 as "unreachable").
    '''
    length = 0
    try:
        conn = urllib.request.urlopen(url)
        for header, value in conn.info().items():
            if header.lower().find('length') != -1:
                length = int(value.strip())
    except Exception:
        # Deliberate best-effort: any failure reports size 0.
        pass
    return length





def haslive(ts):
    '''Return True while at least one thread in *ts* is still running.'''
    return any(t.is_alive() for t in ts)





def myhttpget(url, output=None, connections=4):
    """Download *url* with *connections* parallel range-request threads.

    Arguments:
        url -- target URL (the original notes it arrives in GBK encoding).
        output -- local filename; defaults to the last path component
            of *url* (no encoding conversion is done).
        connections -- number of worker threads / file blocks.

    Raises:
        Urlunreachable -- when the reported file size is 0.
        SystemExit -- on Ctrl-C, after removing the part files.
    """
    length = gethttpfilesize(url)
    print(length)
    mb = length / 1024 / 1024.0
    if length == 0:
        raise Urlunreachable
    blocks = connections
    if output:
        filename = output
    else:
        # Fix: the original set only *output* here, leaving *filename*
        # unbound for the final aggregation step below.
        filename = output = url.split('/')[-1]
    ranges = Split(length, blocks)
    names = ['%s_%d' % (output, i) for i in range(blocks)]

    ts = []
    for i in range(blocks):
        t = Httpgetthread(i, url, names[i], ranges[i])
        t.daemon = True  # setDaemon() is deprecated since 3.10
        t.start()
        ts.append(t)

    live = haslive(ts)
    startsize = sum(t.downloaded for t in ts)
    starttime = time.time()
    etime = 0
    while live:
        try:
            etime = time.time() - starttime
            d = sum(t.downloaded for t in ts) / float(length) * 100
            downloadedthistime = sum(t.downloaded for t in ts) - startsize
            try:
                # Bytes fetched this session / elapsed seconds, in KB/s.
                rate = downloadedthistime / float(etime) / 1024
            except ZeroDivisionError:
                rate = 100.0
            # '\r' rewinds to line start so progress updates in place.
            progressstr = ('\rFilesize: %d (%.2fM) Downloaded: %.2f%% '
                           'Avg rate: %.1fKB/s' % (length, mb, d, rate))
            sys.stdout.write(progressstr)
            sys.stdout.flush()
            live = haslive(ts)
            time.sleep(0.2)
        except KeyboardInterrupt:
            print()
            print('Exit...')
            # Remove the partial files before bailing out.
            for n in names:
                try:
                    os.remove(n)
                except OSError:
                    pass
            sys.exit(1)

    print()

    # Aggregate the part files into the final output, in order.
    with open(filename, 'wb') as f:
        for n in names:
            with open(n, 'rb') as part:
                f.write(part.read())
            try:
                os.remove(n)
            except OSError:
                pass








if __name__ == '__main__':
    myhttpget('http://hi.baidu.com/zjw0358', 'hi.html', 4)

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.