python 自動下載 voa MP3

來源:互聯網
上載者:User

因為英語學習的需要,經常會去網上下載一些VOA的MP3,常去的一個網站是http://www.51voa.com/

要想下載該網站上的MP3,需要手動選擇要下載的篇幅,開啟之後再選擇要下載的MP3檔案。要下載單獨一個MP3檔案還好,但要是想把某一時間內的所有MP3檔案都下載下來,就很繁瑣,需要重複做那些無聊的操作。能否用python來做一個下載voa MP3的工具呢?

設計思路如下:

一、開啟http://www.51voa.com/首頁,分析html,解析出首頁上 VOA美國之音聽力最新動向 檔案清單,產生<檔案名稱,檔案>的dictionary

二、對已產生的dictionary按照當前日期過濾,得到能下載的當天的VOA MP3

三、對過濾後的dictionary遍歷,同時進行下載操作

其中所使用的技術:

一、解析html,可以使用standard library中的HTMLParser,或者SGMLParser,也可以選擇3rd party的解析庫,比如BeautifulSoup(對html和xml都能很好的支援),本文採用BeautifulSoup

二、下載MP3,採用urllib,為提高效率,使用多線程進行下載,url header中可以使用Range參數分區下載,這樣一來就能多部分協同操作。

 

具體代碼如下:

一、多線程下載部分代碼

 

#!/usr/bin/env python# -*- coding :utf-8 -*-""" It is a multi-thread downloading tool"""import sysimport osimport timeimport urllib2import urllibfrom threading import Threadclass MyWorkThread(Thread, urllib.FancyURLopener):    """    Multi-thread downloading class.    run() is a vitual method of Thread    """    def __init__(self, threadname, url, filename, ranges = 0):        Thread.__init__(self, name = threadname)        urllib.FancyURLopener.__init__(self)        self.name = threadname        self.url = url        self.filename = filename        self.ranges = ranges        self.downloaded = 0    def run(self):        """        virtual function in Thread        """        try:            self.downloaded = os.path.getsize(self.filename)        except OSError:            self.downloaded = 0        #rebuild start point        self.startpoint = self.ranges[0] + self.downloaded                #if this part is completed        if self.startpoint >= self.ranges[1]:            print 'Part %s has been downloaded over.' 
% self.filename            return        self.oneTimeSize = 8 * 1024 #8K bytes / time        print 'task %s will download from %d to %d' %(self.name, self.startpoint, self.ranges[1])        self.addheader('Range', 'bytes=%d-%d' %(self.startpoint, self.ranges[1]))        self.urlhandle = self.open(self.url)        data = self.urlhandle.read(self.oneTimeSize)        while data:            filehandle = open(self.filename, 'ab+')            filehandle.write(data)            filehandle.close()            self.downloaded += len(data)            data = self.urlhandle.read(self.oneTimeSize)            def GetUrlFileSize(url):    urlHandler = urllib.urlopen(url)    headers = urlHandler.info().headers    length = 0    for header in headers:        if header.find('Length') != -1:            length = header.split(':')[-1].strip()            length = int(length)    return lengthdef SpliteBlocks(totalsize, blocknumber):    blocksize = totalsize / blocknumber    ranges = []    for i in range(0, blocknumber -1):        ranges.append((i * blocksize, i * blocksize + blocksize -1))    ranges.append((blocksize * (blocknumber -1), totalsize -1))    return rangesdef isLive(tasks):    for task in tasks:        if task.isAlive():            return True    return Falsedef downLoadFile(url, output, blocks = 6):    sys.stdout.write('Begin to download from %s\n' %url )    sys.stdout.flush()    size = GetUrlFileSize(url)    ranges = SpliteBlocks(size, blocks)        threadname = ["thread_%d" %i for i in range(0, blocks)]    filename = ["tmpfile_%d" %i for i in range(0, blocks)]    tasks = []    for i in range(0, blocks):        task = MyWorkThread(threadname[i], url, filename[i], ranges[i])        task.setDaemon(True)        task.start()        tasks.append(task)    time.sleep(2)    while isLive(tasks):        downloaded = sum([task.downloaded for task in tasks])        process = downloaded / float(size) * 100        show = u'\rFilesize: %d Downloaded:%d Completed: %.2f%%' %(size, downloaded, 
process)        sys.stdout.write(show)        sys.stdout.flush        time.sleep(1)            output = formatFileName(output)    filehandle = open(output, 'wb+')    for i in filename:        f = open(i, 'rb')        filehandle.write(f.read())        f.close()        os.remove(i)    filehandle.close()    sys.stdout.write("Completed!\n")    sys.stdout.flush()        def formatFileName(filename):    if isinstance(filename, str):        header, tail = os.path.split(filename)        if tail != '':            tuple = ('\\','/',':', '*', '?', '"', '<', '>', '|')            for char in tuple:                if tail.find(char) != -1:                    tail = tail.replace(char, '')        filename = os.path.join(header, tail)        #print filename        return filename    else:        return 'None'    if __name__ == '__main__':    url = r'http://www.51voa.com/path.asp?url=/201008/hennessy_africa_wildlife_18aug10-32b.mp3'    output = r"D:\Voa\Study:'Shoot to Kill' Policy in Africa's Parks Abuses Human Rights.mp3"    downLoadFile(url, output, blocks = 4)

 

二、解析voa頁面部分代碼

 

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import chardet
import os
import time
import string
import re
from HTMLParser import HTMLParser
import sys
from BeautifulSoup import BeautifulSoup
import multiThreadDownloadTool


# Site root; relative article and MP3 links are joined onto this prefix.
VOA_URL = r'http://www.51voa.com'
# Local folder the downloaded MP3 files are stored in (Windows path).
DOWNLOAD_DIR = r'D:/Voa'

# NOTE(review): this string is a no-op expression, not the module docstring
# (it does not appear at the top of the module).
"""
File downloading from the web.
"""

def getURLContent(url):
"""
get url content of the url, begin with html and ignor the doctype declarations
"""
file = urllib2.urlopen(url)
#print file.info()
data = file.read()
file.close()
#return data.decode('utf-8')
index = data.find('html')
data = data[index - 1 :]
return data
def getVOAURLs(content):
    """
    Find the VOA article links in the front-page markup *content*.

    Returns a dict mapping the link text (the article title, which ends
    with a publish date) to the absolute article URL.
    """
    urls = {}
    soup = BeautifulSoup(content)
    # The newest-articles listing lives in the right-hand container div.
    divs = soup.findAll('div', {'id':'rightContainer'})
    neededDiv = None
    if len(divs) >= 1:
        neededDiv = divs[0]
    if neededDiv != None:
        # Inside the div the list sits in <span id="list"> as <li> items.
        neededSpan = neededDiv.find('span', {'id' : 'list'})
        lis = neededSpan.findAll('li')
        for li in lis:
            needAs = li.findAll('a')
            # The last <a> of each <li> is the article link itself
            # (earlier <a>s are category links) -- TODO confirm against site.
            urls[needAs[-1].string] = VOA_URL + needAs[-1]['href']
    print "getVOAURLs() urls count is " , len(urls)
    return urls
def filterbyDate(urls ,date):
    """
    Keep only the entries of *urls* whose title ends with the given date.

    urls -- dict of {title: url} as produced by getVOAURLs(); titles are
            assumed to end with '(YYYY-M-D)' -- TODO confirm against site.
    date -- date string to match, e.g. '2010-8-20'; None means today.
    """
    neededURLs = {}
    currentDate = time.localtime(time.time());
    # Month and day are NOT zero padded, presumably matching the site's
    # title format -- NOTE(review): verify against the live page.
    currentDateStr = "%s-%s-%s" %(currentDate.tm_year, currentDate.tm_mon, currentDate.tm_mday)
    if(date != None):
        currentDateStr = date
    for url in urls.keys():
        # The dict key is really the article title; the value is the URL.
        name = url.lstrip().rstrip()
        length = len(name)  # NOTE(review): unused
        # Slice the date out of the trailing '(date)' suffix, excluding ')'.
        publishDate = name[- len(currentDateStr) - 1 : -1]
        if publishDate == currentDateStr:
            neededURLs[name] = urls[url]
            print 'find ', name

    print 'After filter, the count is ' , len(neededURLs)
    return neededURLs

def findMP3FileInURL(url):
"""
find MP3 files in a url
"""
print 'parse the content of ', url
urls = []
#define a MP3 re string
p = re.compile(r'/path.asp\?url=[-\w/]*\.mp3')
#p = re.compile(r'/[-\w/]*\.mp3')
content = getURLContent(url)
matchLinks = p.findall(content)
#print matchLinks
for link in matchLinks:
tmp = VOA_URL + link
if tmp in urls: # check if exist already
pass
else:
urls.append(tmp)
print 'Current count of mp3 files is ', len(urls)
return urls
def getHTMLFile(url, file_name):
    """
    Download *url* and save its raw bytes to the local file *file_name*.
    """
    ifile = urllib2.urlopen(url)
    try:
        content = ifile.read()
    finally:
        # BUG FIX: the original never closed the response handle.
        ifile.close()
    # BUG FIX: open in binary mode ('wb', not 'w') so the bytes are written
    # unmodified on Windows, and close the file even if write() raises.
    local_file = open(file_name, 'wb')
    try:
        local_file.write(content)
    finally:
        local_file.close()

def downloadFile(url, fileName2Store):
"""
download file from url, and store it to local system using fileName2Store parameter
"""
try:
full_path = os.path.join(DOWNLOAD_DIR, fileName2Store)
print 'begin to download url to ', full_path
if os.path.isfile(full_path):
#already exist
print 'the file ', full_path, 'is alreasy exist, so just skip it!'
else:
print '\tDownloading the mp3 file...',
data=urllib2.urlopen(url).read()
print 'Done'
print '\tWriting data info file...',
f=file(full_path, 'wb')
f.write(data)
print 'Done'
f.close()
except Exception, ex:
print 'some exceptions occur when downloading ', ex
if __name__ == "__main__":
try:
#getHTMLFile(VOA_URL, r'.\Voa.html')
context = getURLContent(VOA_URL)
#file_read = open(r'.\Voa.html', 'r')
#context = file_read.read()
#print context
#print '\n' * 5 #print chardet.detect(context) print 'Begin to get download information, it may cost some minuts, please wait...' files2download = getVOAURLs(context) neededDownload = filterbyDate(files2download, None) neededDownloadMp3s = {} for name in neededDownload.keys(): fullURL = neededDownload[name] formatedName = name[: -11].lstrip().rstrip() #formatedName = formatedName.replace(' ', '-') #print formatedName, ' ' * 5, fullURL #print fullURL mp3Names = findMP3FileInURL(fullURL) if len(mp3Names) == 1: #there is only on mp3 file in this file ,so we will use the formatedname neededDownloadMp3s[formatedName] = mp3Names[0] else: for name in mp3Names: print name index_begin = name.rfind('/') index_end = name.rfind('.') tmpName = name[index_begin + 1 : index_end] neededDownloadMp3s[tmpName] = name print 'Now , the mp3 files are :' print neededDownloadMp3s #findMP3FileInURL(r'http://www.51voa.com/VOA_Special_English/Phoning-Fertilizer-Philippine-Rice-Farmers--38545.html') #findMP3FileInURL(r'http://www.51voa.com/Voa_English_Learning/Learn_A_Word_38412.html') #down load file for filename in neededDownloadMp3s.keys(): try: full_path = os.path.join(DOWNLOAD_DIR, filename) full_path = full_path + r'.mp3' if full_path == r'D:\Voa\hennessy_africa_wildlife_18aug10-32b.mp3': multiThreadDownloadTool.downLoadFile(neededDownloadMp3s[filename], full_path) except Exception, ex: print 'Some exceptions occur, when downloading file from %s, exception messages are %s' %(neededDownloadMp3s[filename], ex) #downloadFile(r'http://www.51voa.com/path.asp?url=/201008/mercer_australia_election_16aug10-32b.mp3', 'test.mp3') except Exception, ex: print 'Exception caught, tracebacks are :',sys.exc_info(), ex print 'download all completed!' raw_input("Press any key to continue...")

 

需要注意的地方:

在使用BeautifulSoup進行html解析的時候發現,BeautifulSoup對於

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

 的支援不是很好,經常解析不出來,所以為了方便,在解析的時候先將源檔案解析,只將<html></html>之間的資料交與BeautifulSoup解析。具體為什麼BeautifulSoup解析

DOCTYPE出錯,我還沒查出問題所在,希望有知道的朋友告知一聲。


相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.