For English study I often download VOA MP3 files from the Internet, and the site I usually visit is http://www.51voa.com/.
To download an MP3 from the site by hand, you have to pick the item you want, open its page, and then click the MP3 file it links to. That is fine for a single file, but downloading all the MP3 files published within a certain period means repeating the same boring operations over and over. Can Python be used to download the VOA MP3s automatically?
The design idea is as follows (a short sketch of the flow appears right after this list):
1. Open the http://www.51voa.com/ main page and parse its HTML. From the recently updated VOA (Voice of America) listening file list on the main page, build a dictionary of <file name, file URL>.
2. Filter the generated dictionary by the current date to get the VOA MP3 files published that day.
3. Traverse the filtered dictionary and download each file (the download itself is multi-threaded).
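A minimal sketch of this three-step flow, using the function names that appear in the code later in this post (treat it as pseudocode until those functions are defined):

# rough sketch only; getURLContent, getVOAURLs, filterbyDate, findMP3FileInURL and
# downloadFile are the functions defined in part II below
content = getURLContent(VOA_URL)                     # step 1: fetch the main page HTML
files2download = getVOAURLs(content)                 # step 1: {file name: article page URL}
neededDownload = filterbyDate(files2download, None)  # step 2: keep only today's entries
for name, pageURL in neededDownload.items():         # step 3: walk the dictionary...
    for mp3URL in findMP3FileInURL(pageURL):         #         ...find the MP3 link(s) on each page
        downloadFile(mp3URL, name + '.mp3')          #         ...and save them to disk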
Technologies used:
1. Parsing HTML. You can use HTMLParser or SGMLParser from the standard library, or a third-party parsing library such as BeautifulSoup (which handles both HTML and XML well). This article uses BeautifulSoup; a small parsing sketch follows this list.
2. Downloading the MP3 files with urllib/urllib2. To improve efficiency, several threads download the file at the same time: each request carries a Range header in the HTTP headers so that every thread fetches only its own byte range, and the parts are merged afterwards. A minimal ranged-request sketch also follows this list.
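As a taste of the BeautifulSoup side, here is a minimal sketch of pulling the link list out of the main page; it uses the same div/span ids that the full code in part II relies on, and BeautifulSoup 3.x is assumed:

# minimal BeautifulSoup 3.x sketch; the 'rightContainer' div and 'list' span are the
# same elements that getVOAURLs() in part II walks through
import urllib2
from BeautifulSoup import BeautifulSoup

html = urllib2.urlopen('http://www.51voa.com').read()
soup = BeautifulSoup(html[html.find('<html'):])      # skip the DOCTYPE, see the note at the end
listSpan = soup.find('div', {'id': 'rightContainer'}).find('span', {'id': 'list'})
for li in listSpan.findAll('li'):
    a = li.findAll('a')[-1]                          # the last <a> in each <li> is the article link
    print a.string, a['href']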
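And a minimal sketch of a ranged request with urllib2; the byte range is arbitrary and the MP3 URL is just the example used in the main block of part I:

# minimal ranged-request sketch (Python 2 urllib2); status 206 means the server honored Range
import urllib2

request = urllib2.Request(r'http://www.51voa.com/path.asp?url=/201008/hennessy_africa_wildlife_18aug10-32b.mp3')
request.add_header('Range', 'bytes=0-8191')          # ask for the first 8 KB only
response = urllib2.urlopen(request)
data = response.read()
print 'HTTP status %s, got %d bytes' % (response.getcode(), len(data))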
The code is as follows:
I. The multi-threaded download code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
It is a multi-thread downloading tool
"""
import sys
import os
import time
import urllib2
import urllib
from threading import Thread

class MyWorkThread(Thread, urllib.FancyURLopener):
    """
    Multi-thread downloading class.
    run() is a virtual method of Thread
    """
    def __init__(self, threadname, url, filename, ranges = 0):
        Thread.__init__(self, name = threadname)
        urllib.FancyURLopener.__init__(self)
        self.name = threadname
        self.url = url
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0
    def run(self):
        """
        virtual function in Thread
        """
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        #rebuild start point
        self.startpoint = self.ranges[0] + self.downloaded
        #if this part is completed
        if self.startpoint >= self.ranges[1]:
            print 'Part %s has been downloaded over.' % self.filename
            return
        self.oneTimeSize = 8 * 1024 #8K bytes / time
        print 'task %s will download from %d to %d' %(self.name, self.startpoint, self.ranges[1])
        #only request the byte range this thread is responsible for
        self.addheader('Range', 'bytes=%d-%d' %(self.startpoint, self.ranges[1]))
        self.urlhandle = self.open(self.url)
        data = self.urlhandle.read(self.oneTimeSize)
        while data:
            filehandle = open(self.filename, 'ab+')
            filehandle.write(data)
            filehandle.close()
            self.downloaded += len(data)
            data = self.urlhandle.read(self.oneTimeSize)

def GetUrlFileSize(url):
    """read the Content-Length header to learn the total file size"""
    urlHandler = urllib.urlopen(url)
    headers = urlHandler.info().headers
    length = 0
    for header in headers:
        if header.find('Length') != -1:
            length = header.split(':')[-1].strip()
            length = int(length)
    return length

def SpliteBlocks(totalsize, blocknumber):
    """split [0, totalsize) into blocknumber (start, end) ranges"""
    blocksize = totalsize / blocknumber
    ranges = []
    for i in range(0, blocknumber -1):
        ranges.append((i * blocksize, i * blocksize + blocksize -1))
    ranges.append((blocksize * (blocknumber -1), totalsize -1))
    return ranges

def isLive(tasks):
    for task in tasks:
        if task.isAlive():
            return True
    return False

def downLoadFile(url, output, blocks = 6):
    sys.stdout.write('Begin to download from %s\n' %url )
    sys.stdout.flush()
    size = GetUrlFileSize(url)
    ranges = SpliteBlocks(size, blocks)
    threadname = ["thread_%d" %i for i in range(0, blocks)]
    filename = ["tmpfile_%d" %i for i in range(0, blocks)]
    tasks = []
    for i in range(0, blocks):
        task = MyWorkThread(threadname[i], url, filename[i], ranges[i])
        task.setDaemon(True)
        task.start()
        tasks.append(task)
    time.sleep(2)
    while isLive(tasks):
        downloaded = sum([task.downloaded for task in tasks])
        process = downloaded / float(size) * 100
        show = u'\rFilesize: %d Downloaded:%d Completed: %.2f%%' %(size, downloaded, process)
        sys.stdout.write(show)
        sys.stdout.flush()
        time.sleep(1)
    #merge the temporary part files into the final output file
    output = formatFileName(output)
    filehandle = open(output, 'wb+')
    for i in filename:
        f = open(i, 'rb')
        filehandle.write(f.read())
        f.close()
        os.remove(i)
    filehandle.close()
    sys.stdout.write("Completed!\n")
    sys.stdout.flush()

def formatFileName(filename):
    """strip characters that are not allowed in file names"""
    if isinstance(filename, str):
        header, tail = os.path.split(filename)
        if tail != '':
            invalidChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|')
            for char in invalidChars:
                if tail.find(char) != -1:
                    tail = tail.replace(char, '')
            filename = os.path.join(header, tail)
        #print filename
        return filename
    else:
        return 'None'

if __name__ == '__main__':
    url = r'http://www.51voa.com/path.asp?url=/201008/hennessy_africa_wildlife_18aug10-32b.mp3'
    output = r"D:\Voa\Study:'Shoot to Kill' Policy in Africa's Parks Abuses Human Rights.mp3"
    downLoadFile(url, output, blocks = 4)
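For example, splitting a 1,000,000-byte file into 4 blocks with SpliteBlocks() gives the following (start, end) byte ranges; each worker thread then requests its own range with the Range header and appends the data to its own temporary file:

print SpliteBlocks(1000000, 4)
# [(0, 249999), (250000, 499999), (500000, 749999), (750000, 999999)]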
II. The VOA page parsing code
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import chardet
import os
import time
import string
import re
from HTMLParser import HTMLParser
import sys
from BeautifulSoup import BeautifulSoup
import multiThreadDownloadTool
VOA_URL = r'http://www.51voa.com'
DOWNLOAD_DIR = r'D:/Voa'
"""
File downloading from the web.
"""
def getURLContent(url):
    """
    get the content of the url, beginning with the <html> tag and ignoring the DOCTYPE declaration
    """
    file = urllib2.urlopen(url)
    #print file.info()
    data = file.read()
    file.close()
    #return data.decode('utf-8')
    #skip everything before the <html> tag, BeautifulSoup chokes on the DOCTYPE (see the note at the end)
    index = data.find('<html')
    data = data[index :]
    return data
def getVOAURLs(content):
    """
    find the voa script urls in the content
    """
    urls = {}
    soup = BeautifulSoup(content)
    divs = soup.findAll('div', {'id':'rightContainer'})
    #print divs
    neededDiv = None
    if len(divs) >= 1:
        neededDiv = divs[0]
    if neededDiv != None:
        #pass the div
        #print neededDiv
        neededSpan = neededDiv.find('span', {'id' : 'list'})
        #print neededSpan
        lis = neededSpan.findAll('li')
        #print lis
        for li in lis:
            needAs = li.findAll('a')
            #got it
            #print needAs[1]
            #print needAs[1]['href']
            #print needAs[-1].string
            urls[needAs[-1].string] = VOA_URL + needAs[-1]['href']
    print "getVOAURLs() urls count is " , len(urls)
    return urls
def filterbyDate(urls ,date):
    """
    filter the urls by date
    """
    neededURLs = {}
    currentDate = time.localtime(time.time())
    #currentDateStr = time.strftime('%Y-%m-%d', currentDate)
    #currentDateStr = currentDate.tm_year + '-' + currentDate.tm_mon + ' ' + currentDate.tm_mday
    currentDateStr = "%s-%s-%s" %(currentDate.tm_year, currentDate.tm_mon, currentDate.tm_mday)
    if(date != None):
        currentDateStr = date
    for url in urls.keys():
        name = url.lstrip().rstrip()
        length = len(name)
        publishDate = name[- len(currentDateStr) - 1 : -1]
        #print publishDate
        if publishDate == currentDateStr:
            neededURLs[name] = urls[url]
            print 'find ', name
    print 'After filter, the count is ' , len(neededURLs)
    return neededURLs
def findMP3FileInURL(url):
    """
    find MP3 files in a url
    """
    print 'parse the content of ', url
    urls = []
    #define a MP3 re string
    p = re.compile(r'/path.asp\?url=[-\w/]*\.mp3')
    #p = re.compile(r'/[-\w/]*\.mp3')
    content = getURLContent(url)
    matchLinks = p.findall(content)
    #print matchLinks
    for link in matchLinks:
        tmp = VOA_URL + link
        if tmp in urls: # check if it exists already
            pass
        else:
            urls.append(tmp)
    print 'Current count of mp3 files is ', len(urls)
    return urls
def getHTMLFile(url, file_name):
    ifile = urllib2.urlopen(url)
    content = ifile.read()
    local_file = open(file_name, 'w')
    local_file.write(content)
    local_file.close()
def downloadFile(url, fileName2Store):
    """
    download file from url, and store it to local system using fileName2Store parameter
    """
    try:
        full_path = os.path.join(DOWNLOAD_DIR, fileName2Store)
        print 'begin to download url to ', full_path
        if os.path.isfile(full_path):
            #already exists, skip it
            print 'the file ', full_path, ' already exists, so just skip it!'
        else:
            print '\tDownloading the mp3 file...',
            data = urllib2.urlopen(url).read()
            print 'Done'
            print '\tWriting data into file...',
            f = file(full_path, 'wb')
            f.write(data)
            print 'Done'
            f.close()
    except Exception, ex:
        print 'some exceptions occur when downloading ', ex
if __name__ == "__main__":
    try:
        #getHTMLFile(VOA_URL, r'.\Voa.html')
        context = getURLContent(VOA_URL)
        #file_read = open(r'.\Voa.html', 'r')
        #context = file_read.read()
        #print context
        #print '\n' * 5
        #print chardet.detect(context)
        print 'Begin to get download information, it may cost some minutes, please wait...'
        files2download = getVOAURLs(context)
        neededDownload = filterbyDate(files2download, None)
        neededDownloadMp3s = {}
        for name in neededDownload.keys():
            fullURL = neededDownload[name]
            formatedName = name[: -11].lstrip().rstrip()
            #formatedName = formatedName.replace(' ', '-')
            #print formatedName, ' ' * 5, fullURL
            #print fullURL
            mp3Names = findMP3FileInURL(fullURL)
            if len(mp3Names) == 1:
                #there is only one mp3 file on this page, so we will use the formatted name
                neededDownloadMp3s[formatedName] = mp3Names[0]
            else:
                for name in mp3Names:
                    print name
                    index_begin = name.rfind('/')
                    index_end = name.rfind('.')
                    tmpName = name[index_begin + 1 : index_end]
                    neededDownloadMp3s[tmpName] = name
        print 'Now , the mp3 files are :'
        print neededDownloadMp3s
        #findMP3FileInURL(r'http://www.51voa.com/VOA_Special_English/Phoning-Fertilizer-Philippine-Rice-Farmers--38545.html')
        #findMP3FileInURL(r'http://www.51voa.com/Voa_English_Learning/Learn_A_Word_38412.html')
        #download the files
        for filename in neededDownloadMp3s.keys():
            try:
                full_path = os.path.join(DOWNLOAD_DIR, filename)
                full_path = full_path + r'.mp3'
                if full_path == r'D:\Voa\hennessy_africa_wildlife_18aug10-32b.mp3':
                    multiThreadDownloadTool.downLoadFile(neededDownloadMp3s[filename], full_path)
            except Exception, ex:
                print 'Some exceptions occur, when downloading file from %s, exception messages are %s' %(neededDownloadMp3s[filename], ex)
        #downloadFile(r'http://www.51voa.com/path.asp?url=/201008/mercer_australia_election_16aug10-32b.mp3', 'test.mp3')
    except Exception, ex:
        print 'Exception caught, tracebacks are :', sys.exc_info(), ex
    print 'download all completed!'
    raw_input("Press any key to continue...")
Note:
When BeautifulSoup is used for HTML parsing, it does not cope well with the DOCTYPE declaration
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
and often fails to parse the page at all. For convenience, the page source is therefore pre-processed first, and only the data from the <html> tag onward is handed to BeautifulSoup (see getURLContent() above), which avoids the DOCTYPE error. I have not found the real cause of the problem; if you know it, please tell me.
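For reference, a minimal reproduction of that workaround (BeautifulSoup 3.x assumed), which is exactly what getURLContent() above does:

# strip everything before the <html> tag so the DOCTYPE never reaches BeautifulSoup
import urllib2
from BeautifulSoup import BeautifulSoup

data = urllib2.urlopen('http://www.51voa.com').read()
soup = BeautifulSoup(data[data.find('<html'):])      # parses fine once the DOCTYPE is gone
print soup.find('title')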