最近寫了個豆瓣個人電台自動下載加心歌曲的小程式,基本能夠下載,但需要手動將"http://douban.fm/mine?type=liked"頁面全都下載下來,有點蛋疼,- - !!。由於還沒有實現程式登入豆瓣的功能,暫時先這樣用吧。
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Download the songs marked as liked on a douban.fm personal station.

The liked-song pages ("http://douban.fm/mine?type=liked") must be saved by
hand as 0.html, 1.html, ... in the working directory, because logging in to
douban from the program is not implemented yet (see ``logindouban``).  Each
song found on those pages is searched on music.yahoo.cn and the first
matching result is stored below ``DOWNLOAD_DIR``.

The code runs on Python 2.7 (the interpreter it was written for) and on
Python 3.  All text is handled as the interpreter's native ``str`` type.
"""
import os
import re
import socket
import sys

try:  # Python 3 module layout
    from http.cookiejar import CookieJar
    from urllib.parse import quote_plus, unquote, urlencode
    from urllib.request import (
        HTTPCookieProcessor,
        Request,
        build_opener,
        install_opener,
        urlopen,
    )
except ImportError:  # Python 2.7
    # urllib2.urlopen (not urllib.urlopen) is used on purpose: it honours the
    # opener installed by logindouban() and raises on HTTP error statuses
    # instead of handing back the error page as if it were content.
    from cookielib import CookieJar
    from urllib import quote_plus, unquote, urlencode
    from urllib2 import (
        HTTPCookieProcessor,
        Request,
        build_opener,
        install_opener,
        urlopen,
    )

SOCKET_TIMEOUT_SECONDS = 1
DOWNLOAD_DIR = "./down"
DEFAULT_PAGE_COUNT = 17
LOGIN_URL = "http://www.douban.com/accounts/login"
SEARCH_URL = "http://music.yahoo.cn/s?q={0}&m=0"
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
)

# Applies to every socket created afterwards, i.e. to all urlopen() calls.
socket.setdefaulttimeout(SOCKET_TIMEOUT_SECONDS)


def _native(utf8_bytes):
    """Return UTF-8 encoded bytes as the interpreter's native ``str`` type."""
    if isinstance(utf8_bytes, str):  # Python 2: str already is the byte string
        return utf8_bytes
    return utf8_bytes.decode("utf-8", "replace")


# Full-width parentheses (U+FF08 / U+FF09), spelled as UTF-8 bytes so that the
# pattern works on Python 2 byte strings as well as on Python 3 text.
_FULLWIDTH_OPEN = _native(b"\xef\xbc\x88")
_FULLWIDTH_CLOSE = _native(b"\xef\xbc\x89")

_TAG_OR_ENTITY = re.compile(
    r"<.*?>|&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);", re.S
)
_BRACKETED = re.compile(
    r"(?:[(\[{]|" + _FULLWIDTH_OPEN + r").*?(?:[)\]}]|" + _FULLWIDTH_CLOSE + r")"
)
_LIKED_TABLE = re.compile(
    r'<table\s*class="olts"\s*width="100%">.*?</table>', re.S
)
_LIKED_TBODY = re.compile(r'<tbody>.*?</tbody>', re.S)
_LIKED_ROW = re.compile(
    r'<tr>\s*<td>(.*?)</td>.*?<span>(.*?)</span>.*?</tr>', re.S
)
_SEARCH_RESULTS = re.compile(r'<div class="yst-music">.*?</table>', re.S)
_RESULT_ROW = re.compile(
    r'<tr>\s*<td class="m_song">\s*<a href=".*?url=(.*?)"'
    r'.*?>(.*?)</a>'
    r'.*?<td class="m_singer">.*?>(.*?)</a>'
    r'.*?<td.*?<td>(.*?)</td>'
    r'.*?<td>(.*?)[mM][bB]'
    r'.*?</tr>',
    re.S,
)


def getpag(url):
    """Fetch *url* and return its body as text, or "" when the fetch fails.

    Failures are reported on stdout and never raised: a page that cannot be
    fetched is treated like an empty page by the callers.
    """
    try:
        response = urlopen(url)
        try:
            body = response.read()
        finally:
            response.close()
    except Exception:  # best-effort boundary: any failure means "no page"
        print("error in getpag({0})".format(url))
        return ""
    return _native(body)


def removehtml(s):
    """Return *s* with HTML tags and character entities removed.

    Only well-formed entities (``&name;``, ``&#123;``, ``&#x1F;``) are
    removed, so a bare "&" in a title does not swallow the text after it.
    """
    return _TAG_OR_ENTITY.sub("", s)


def removeotherword(s):
    """Return *s* without bracketed remarks and surrounding whitespace.

    A remark starts at any of ``( [ {`` or a full-width "(" and ends at the
    next ``) ] }`` or full-width ")".  Whitespace is stripped after the
    remarks are removed, so "Song (Live)" becomes "Song" and not "Song ".
    """
    return _BRACKETED.sub("", s).strip()


# login douban & save cookie
# TODO: not working yet and not called anywhere.
def logindouban(email="vodmaker@gmail.com", password="xxx"):
    """Post the login form to douban and install a cookie-keeping opener.

    Prints the encoded form data and the URL the login request ended on.
    """
    data = urlencode({
        'source': 'simple',
        'form_email': email,
        'form_password': password,
        'remember': 'on',
    })
    # NOTE(review): this echoes the password to stdout; kept as debug output.
    print(data)
    cj = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cj))
    install_opener(opener)
    opener.addheaders = [("User-agent", USER_AGENT)]
    # Python 3 requires the request body to be bytes.
    body = data if isinstance(data, bytes) else data.encode("ascii")
    response = urlopen(Request(LOGIN_URL, body))
    try:
        print(response.geturl())
    finally:
        response.close()


def matchexact(s1, s2):
    """Return True when *s1* occurs literally in *s2*, ignoring case.

    *s1* is escaped first, so titles containing regex metacharacters
    ("C++", "What?", an unbalanced "(") neither raise nor match loosely.
    An empty *s1* matches every *s2*.
    """
    return re.search(re.escape(s1), s2, re.I) is not None


def matchmost(s1, s2):
    """Return True when *s1* occurs in *s2*, ignoring case and spacing.

    Every whitespace character of *s1* may match any amount of whitespace
    (including none) in *s2*; everything else is matched literally.
    """
    pattern = r"\s*".join(re.escape(piece) for piece in re.split(r"\s", s1))
    return re.search(pattern, s2, re.I) is not None


# (match function, minimum size in MB): the strict pass runs first, the
# whitespace-tolerant pass is only a fallback and accepts smaller files.
_MATCH_PASSES = ((matchexact, 2), (matchmost, 1))


def getmusiclist_perpag(list_pag):
    """Return ``[[name, artist], ...]`` parsed from one saved liked-songs page.

    Every entry found is also printed.  A page without the expected table
    yields an empty list.
    """
    table = _LIKED_TABLE.search(list_pag)
    if table is None:
        return []
    tbody = _LIKED_TBODY.search(table.group())
    if tbody is None:
        return []
    musiclist = []
    for row in _LIKED_ROW.finditer(tbody.group()):
        name = removeotherword(row.group(1))
        artist = removeotherword(row.group(2))
        print("music:" + name + "artist:" + artist)
        musiclist.append([name, artist])
    return musiclist


def _parse_candidates(htable):
    """Return ``[(url, title, artist, file_type, size_mb), ...]`` for a result table."""
    candidates = []
    for row in _RESULT_ROW.finditer(htable):
        title, artist, file_type, size = [
            removeotherword(removehtml(row.group(i))) for i in range(2, 6)
        ]
        try:
            size_mb = float(size)
        except ValueError:
            continue  # size column is not a number, so the row cannot be filtered
        # The href is a redirect whose "url=" parameter holds the real,
        # percent-encoded download address.
        candidates.append(
            (unquote(row.group(1)), title, artist, file_type, size_mb)
        )
    return candidates


def _save_track(downurl, music, file_type):
    """Fetch *downurl* and store it as ``DOWNLOAD_DIR/<music>.<file_type>``."""
    response = urlopen(downurl)
    try:
        music_stream = response.read()
    finally:
        response.close()
    if not os.path.isdir(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)
    # Path separators in a title would otherwise point outside DOWNLOAD_DIR.
    filename = (music + "." + file_type).replace("/", "_").replace("\\", "_")
    with open(os.path.join(DOWNLOAD_DIR, filename), "wb") as out:
        out.write(music_stream)


# download music from music.yahoo.cn
def downloadfromyahoo(musiclist):
    """Search every ``[name, artist]`` of *musiclist* and download it.

    For each song the search results are tried in order, first with
    ``matchexact`` (files larger than 2 MB), then with ``matchmost`` (files
    larger than 1 MB).  A result whose download fails is reported and the
    next one is tried.  Progress and the final outcome are printed.
    """
    for ma in musiclist:
        music = ma[0]
        artist = ma[1]
        print("Music:\t" + music + "\tArtist:\t" + artist + "is Downloading...")
        listpag = getpag(SEARCH_URL.format(quote_plus(music)))
        results = _SEARCH_RESULTS.search(listpag)
        if results is None:
            print("No search result of {0} in yahoo.cn".format(music))
            continue
        candidates = _parse_candidates(results.group())
        find = False
        for matches, min_size_mb in _MATCH_PASSES:
            for downurl, music_t, artist_t, type_t, size_mb in candidates:
                if not (matches(music_t, music)
                        and matches(artist_t, artist)
                        and size_mb > min_size_mb):
                    continue
                print("download from :" + downurl + "")
                try:
                    _save_track(downurl, music, type_t)
                except Exception as err:  # best effort: try the next result
                    print(err)
                    continue
                find = True
                print("download success: music:{0}, artist:{1}".format(
                    music, artist))
                break
            if find:
                break
        if not find:
            print("download failed: music:{0}, artist:{1}".format(
                music, artist))


def main(page_count=DEFAULT_PAGE_COUNT):
    """Read 0.html .. <page_count - 1>.html and download every song listed."""
    musiclist = []
    for i in range(page_count):
        with open("{0}.html".format(i), "rb") as f:
            musiclist += getmusiclist_perpag(_native(f.read()))
    downloadfromyahoo(musiclist)


if __name__ == "__main__":
    # Optional first argument: number of saved pages (defaults to 17).
    main(int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_PAGE_COUNT)
貌似豆瓣有屏蔽程式訪問頁面的措施,目前仍糾結於如何實現程式登入的部分,不能保證一定能實現登入豆瓣並自動抓取加心頁面的功能,程式更新期限未知。