Today I took a look at multi-threaded file downloading, something I picked up while learning Python.
To implement a simple multi-threaded download, you need to get the following points right (a minimal sketch follows the list):
1. File size: it can be read from the response header; for example, "Content-Length: 911" means the file is 911 bytes.
2. Task splitting: specify which block of the file each thread downloads by adding a header such as "Range: bytes=300-400" to the request (meaning: download bytes 300~400 of the content). Note that the requestable range of a file is [0, size-1] bytes.
3. Merging the downloaded blocks: each thread saves its own block as a temporary file; once all threads have finished, the temporary files are concatenated, in order, into the final file.
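Before the full tool, here is a minimal sketch (my own, not from the original post) of points 1 and 2 using urllib2; the URL is a placeholder, and a server that supports Range requests is assumed:

import urllib2

url = 'http://example.com/somefile.bin'   # placeholder URL

# point 1: file size from the Content-Length response header
size = int(urllib2.urlopen(url).info().getheader('Content-Length'))
print 'file size: %d bytes' % size

# point 2: request one block; valid offsets are 0 .. size-1, inclusive
req = urllib2.Request(url)
req.add_header('Range', 'bytes=%d-%d' % (0, size / 2))
block = urllib2.urlopen(req).read()   # a Range-aware server answers with 206 Partial Content

Point 3 is just concatenating the temporary files in order, which both listings below do after the threads finish.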
I found a good multi-threaded download example on the internet, written following the idea of Axel, the Linux multi-threaded download tool; the source code is at http://fayaa.com/code/view/58/full/.
I have kept a copy and modified it slightly so it can be adapted to further needs; my version is at:
https://github.com/smilejay/python/blob/master/py2014/paxel.py
The code is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# filename: paxel.py
# from: http://fayaa.com/code/view/58/full/
# Jay modified it a little and saved it for further potential usage.

'''It is a multi-thread downloading tool.
It was developed following Axel.
    Author: volans
    E-mail: volansw [at] gmail.com
'''

import sys
import os
import time
import urllib
from threading import Thread

# in case you want to use an http_proxy
local_proxies = {'http': 'http://131.139.58.200:8080'}


class AxelPython(Thread, urllib.FancyURLopener):
    '''Multi-thread downloading class.

    run() is a virtual method of Thread.
    '''
    def __init__(self, threadname, url, filename, ranges=0, proxies={}):
        Thread.__init__(self, name=threadname)
        urllib.FancyURLopener.__init__(self, proxies)
        self.name = threadname
        self.url = url
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0

    def run(self):
        '''virtual function in Thread'''
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            # print 'never downloaded'
            self.downloaded = 0

        # rebuild the start point
        self.startpoint = self.ranges[0] + self.downloaded

        # this block is already completed
        if self.startpoint >= self.ranges[1]:
            print 'Part %s has been downloaded over.' % self.filename
            return

        self.oneTimeSize = 16384    # 16KByte/time
        print 'task %s will download from %d to %d' % (self.name, self.startpoint, self.ranges[1])

        self.addheader("Range", "bytes=%d-%d" % (self.startpoint, self.ranges[1]))
        self.urlhandle = self.open(self.url)

        data = self.urlhandle.read(self.oneTimeSize)
        while data:
            filehandle = open(self.filename, 'ab+')
            filehandle.write(data)
            filehandle.close()

            self.downloaded += len(data)
            # print '%s' % (self.name)
            # progress = u'\r...'
            data = self.urlhandle.read(self.oneTimeSize)


def GetUrlFileSize(url, proxies={}):
    urlHandler = urllib.urlopen(url, proxies=proxies)
    headers = urlHandler.info().headers
    length = 0
    for header in headers:
        if header.find('Length') != -1:
            length = header.split(':')[-1].strip()
            length = int(length)
    return length


def SpliteBlocks(totalsize, blocknumber):
    blocksize = totalsize / blocknumber
    ranges = []
    for i in range(0, blocknumber - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    # the last block takes the remainder
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges


def islive(tasks):
    for task in tasks:
        if task.isAlive():
            return True
    return False


def paxel(url, output, blocks=6, proxies=local_proxies):
    '''paxel
    '''
    size = GetUrlFileSize(url, proxies)
    ranges = SpliteBlocks(size, blocks)

    threadname = ["thread_%d" % i for i in range(0, blocks)]
    filename = ["tmpfile_%d" % i for i in range(0, blocks)]

    tasks = []
    for i in range(0, blocks):
        task = AxelPython(threadname[i], url, filename[i], ranges[i])
        task.setDaemon(True)
        task.start()
        tasks.append(task)

    time.sleep(2)
    while islive(tasks):
        downloaded = sum([task.downloaded for task in tasks])
        process = downloaded / float(size) * 100
        show = u'\rFilesize:%d Downloaded:%d Completed:%.2f%%' % (size, downloaded, process)
        sys.stdout.write(show)
        sys.stdout.flush()
        time.sleep(0.5)

    # merge the temporary files, in order, into the final output file
    filehandle = open(output, 'wb+')
    for i in filename:
        f = open(i, 'rb')
        filehandle.write(f.read())
        f.close()
        try:
            os.remove(i)
        except OSError:
            pass
    filehandle.close()


if __name__ == '__main__':
    url = 'http://dldir1.qq.com/qqfile/QQforMac/QQ_V3.1.1.dmg'
    output = 'download.file'
    paxel(url, output, blocks=4, proxies={})
Adding HTTPS username/password authentication:
A multi-threaded download is worth considering when you need to download SSL-authenticated files over HTTPS, where a single-threaded process often hangs.
The code is as follows:
# -*- coding: utf8 -*-
import os
import sys
import time
import getpass
import urllib2
import cookielib
import threading

url = 'https://your.url'       # your HTTPS URL
username = 'your username'
password = 'your password'


# class which supplies the request authentication info
class TerminalPassword(urllib2.HTTPPasswordMgr):
    def find_user_password(self, realm, authuri):
        retval = urllib2.HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if retval[0] is None and retval[1] is None:
            user = username
            passwd = password
            return (user, passwd)
        else:
            return retval


'''It is a multi-thread downloading tool.
It was developed following Axel.
    Author: volans
    E-mail: volansw [at] gmail.com
    Modify: Gavin Ma
    Date: 2011-04-12
'''


def init():
    try:
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                      urllib2.HTTPBasicAuthHandler(TerminalPassword()))
        urllib2.install_opener(opener)
        urllib2.urlopen(url)
        return True
    except Exception, e:
        pass


class AxelPython(threading.Thread):
    '''Multi-thread downloading class.

    run() is a virtual method of Thread.
    '''
    def __init__(self, threadname, url, virus, filename, ranges=0):
        threading.Thread.__init__(self, name=threadname)
        self.name = threadname
        self.url = url
        self.virus = virus
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0
        self.flag = False

    def run(self):
        '''virtual function in Thread'''
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0

        # rebuild the start point
        self.startpoint = self.ranges[0] + self.downloaded

        # this block is already completed
        if self.startpoint >= self.ranges[1]:
            print 'Part %s has been downloaded over.' % self.filename
            return

        self.oneTimeSize = 8000    # bytes per read

        try:
            init()
            req = urllib2.Request(self.url + self.virus)
            req.add_header("Range", "bytes=%d-%d" % (self.startpoint, self.ranges[1]))
            urlhandle = urllib2.urlopen(req)

            data = urlhandle.read(self.oneTimeSize)
            while data:
                filehandle = open(self.filename, 'ab+')
                filehandle.write(data)
                filehandle.close()

                self.downloaded += len(data)
                data = urlhandle.read(self.oneTimeSize)
            self.flag = True
        except:
            self.flag = False
            pass


def GetUrlFileSize(url, samplevirus):
    if init():
        req = urllib2.Request(url + samplevirus)
        urlHandler = urllib2.urlopen(req)
        headers = urlHandler.info().headers
        length = 0
        for header in headers:
            if header.find('Content-Length') != -1:
                length = header.split(':')[-1].strip()
                length = int(length)
        return length


def SpliteBlocks(totalsize, blocknumber):
    blocksize = totalsize / blocknumber
    ranges = []
    for i in range(0, blocknumber - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges


def islive(tasks):
    for task in tasks:
        if task.isAlive():
            return True
    return False


def paxel(url, samplevirus, output, blocks=6):
    '''paxel
    '''
    size = GetUrlFileSize(url, samplevirus)
    ran = SpliteBlocks(size, blocks)

    threadname = ["thread_%d" % i for i in range(0, blocks)]
    path = os.getcwd()    # directory for the temporary block files
    filename = [path + os.sep + "tmpfile_%d" % i for i in range(0, blocks)]

    tasks = []
    for i in range(0, blocks):
        task = AxelPython(threadname[i], url, samplevirus, filename[i], ran[i])
        task.setDaemon(True)
        task.start()
        tasks.append(task)

    global finish, count
    finish = True
    count = 0
    while finish:
        for task in tasks[:]:    # iterate over a copy; tasks is mutated below
            if not task.isAlive():
                # the thread died before finishing its block: retry it
                # (run() is called directly, so the retry is synchronous)
                task.run()
            time.sleep(0.5)
            if task.flag:
                count += 1
                tasks.remove(task)
                # print count
        if count == blocks:
            finish = False
            # print 'has done'
    time.sleep(2)

    # merge the temporary files, in order, into the final output file
    filehandle = open(output, 'wb+')
    for i in filename:
        f = open(i, 'rb')
        filehandle.write(f.read())
        f.close()
        try:
            os.remove(i)
        except OSError:
            pass
    filehandle.close()


def main():
    # 'download file' and 'output directory' are placeholders from the
    # original post; fill in the real URL suffix and output file name
    paxel(url, 'download file', 'output directory', blocks=6)


if __name__ == '__main__':
    main()
Displaying the average download speed:
The urlretrieve function of the urllib module can download a file once you know its URL:
urllib.urlretrieve(url, filename)
filename is the local file name to save to. The function takes two more optional parameters; see the help documentation for details.
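As a quick illustration (mine, not from the original post), one of those optional parameters, reporthook, is a progress callback that is enough to compute an average speed; the URL and file name below are placeholders:

# Sketch only: urlretrieve calls reporthook(block_count, block_size, total_size)
# after each block; the URL and file name are placeholders.
import time
import urllib

start = time.time()

def reporthook(blocks, blocksize, totalsize):
    downloaded = blocks * blocksize
    elapsed = time.time() - start
    if elapsed > 0 and totalsize > 0:
        print '\rDownloaded: %.2f%% Avg rate: %.1fKB/s' % (
            min(downloaded / float(totalsize) * 100, 100.0),
            downloaded / elapsed / 1024),

urllib.urlretrieve('http://example.com/somefile.bin', 'somefile.bin', reporthook)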
For a multi-threaded download, each thread must be told which piece of the file to fetch from the server. The HTTP protocol allows this by specifying a Range in the request header. The urllib2 module is used below:

import urllib2
request = urllib2.Request(url)
request.add_header("Range", "bytes=%d-%d" % (1024, 2048))  # specify the byte range to download
opener = urllib2.build_opener()
data = opener.open(request).read()

Now data holds the content of the file from byte 1024 to byte 2048 (the range is inclusive, so 1025 bytes in total).
Example:
The code is as follows:
#!/usr/bin/env python
# coding=utf-8

import re
import os
import sys
import time
import glob
import string
import socket
import getopt
import urllib
import urllib2
import threading
from sgmllib import SGMLParser

#############################################################################
#
# self-defined exception classes
#
#############################################################################


class ConnectionError(Exception): pass


class URLUnreachable(Exception): pass


class CanotDownload(Exception): pass

#############################################################################
#
# multiple threads download module starts here
#
#############################################################################


class HTTPGetThread(threading.Thread):
    def __init__(self, name, url, filename, range=0):
        threading.Thread.__init__(self, name=str(name))
        self.url = url
        self.filename = filename
        self.range = range
        self.totallength = range[1] - range[0] + 1
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        self.percent = self.downloaded / float(self.totallength) * 100
        self.headerrange = (self.range[0] + self.downloaded, self.range[1])
        self.buffersize = 8192

    def run(self):
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        self.percent = self.downloaded / float(self.totallength) * 100
        # self.headerrange = (self.range[0]+self.downloaded, self.range[1])
        self.buffersize = 8192
        # request = urllib2.Request(self.url)
        # request.add_header('Range', 'bytes=%d-%d' % self.headerrange)
        downloadall = False
        retries = 1
        while not downloadall:
            if retries > 10:
                break
            try:
                self.headerrange = (self.range[0] + self.downloaded, self.range[1])
                request = urllib2.Request(self.url)
                request.add_header('Range', 'bytes=%d-%d' % self.headerrange)
                conn = urllib2.urlopen(request)
                starttime = time.time()
                data = conn.read(self.buffersize)
                while data:
                    f = open(self.filename, 'ab')
                    f.write(data)
                    f.close()
                    self.time = int(time.time() - starttime)
                    self.downloaded += len(data)
                    self.percent = self.downloaded / float(self.totallength) * 100
                    data = conn.read(self.buffersize)
                downloadall = True
            except Exception, err:
                retries += 1
                time.sleep(1)
                continue


def split(size, blocks):
    ranges = []
    blocksize = size / blocks
    for i in xrange(blocks - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    ranges.append((blocksize * (blocks - 1), size - 1))
    return ranges


def gethttpfilesize(url):
    length = 0
    try:
        conn = urllib.urlopen(url)
        headers = conn.info().headers
        for header in headers:
            if header.find('Length') != -1:
                length = header.split(':')[-1].strip()
                length = int(length)
    except Exception, err:
        pass
    return length


def haslive(ts):
    for t in ts:
        if t.isAlive():
            return True
    return False


def myhttpget(url, output=None, connections=4):
    """
    Arguments:
        url, in GBK encoding
        output, default encoding, no conversion
        connections, integer
    """
    length = gethttpfilesize(url)
    print length
    mb = length / 1024 / 1024.0
    if length == 0:
        raise URLUnreachable
    blocks = connections
    if not output:
        output = url.split('/')[-1]
    filename = output
    ranges = split(length, blocks)
    names = ["%s_%d" % (output, i) for i in xrange(blocks)]
    ts = []
    for i in xrange(blocks):
        t = HTTPGetThread(i, url, names[i], ranges[i])
        t.setDaemon(True)
        t.start()
        ts.append(t)

    live = haslive(ts)
    startsize = sum([t.downloaded for t in ts])
    starttime = time.time()
    etime = 0
    while live:
        try:
            etime = time.time() - starttime
            d = sum([t.downloaded for t in ts]) / float(length) * 100
            downloadedthistime = sum([t.downloaded for t in ts]) - startsize
            try:
                rate = downloadedthistime / float(etime) / 1024
            except ZeroDivisionError:
                rate = 100.0
            progressstr = u'\rFileSize: %d (%.2fM) Downloaded: %.2f%% Avg rate: %.1fKB/s' % (length, mb, d, rate)
            sys.stdout.write(progressstr)
            sys.stdout.flush()
            # sys.stdout.write('\b' * (len(progressstr) + 1))
            live = haslive(ts)
            time.sleep(0.2)
        except KeyboardInterrupt:
            print
            print "Exit..."
            for n in names:
                try:
                    os.remove(n)
                except OSError:
                    pass
            sys.exit(1)
    print
    # print u'Used time: %d:%d, average speed: %.2fKB/s' % (int(etime) / 60, int(etime) % 60, rate)

    # merge the block files, in order, into the final output file
    f = open(filename, 'wb')
    for n in names:
        f.write(open(n, 'rb').read())
        try:
            os.remove(n)
        except OSError:
            pass
    f.close()


if __name__ == '__main__':
    myhttpget('http://hi.baidu.com/zjw0358', 'hi.html', 4)