Today I wrote a Python script that downloads the posts from Han Han's Sina blog. Its basic functions are as follows:
1. Download articles in bulk from Sina blog and create files by article title
2. Format the downloaded articles (strip HTML tags and entities so that only plain text is saved).
Known bug: the formatting of long articles comes out garbled.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Download Sina blog posts and save each one as a plain-text file.

Run as a script, this fetches the article-list page at ARTICLE_LIST_URL,
extracts every article link from it, and writes each article to a file
named after the article's <title> inside DEFAULT_BASEDIR.
"""

import os
import re
import urllib.request

ARTICLE_LIST_URL = "http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html"
DEFAULT_BASEDIR = '/home/tmyyss/article/'

# Text between "<title>" and the next "<" on the same line.
_TITLE_RE = re.compile(r'<title>(.+?)<')

# Markers that bracket the post body in the page markup.
# NOTE(review): this script was recovered from a machine-translated listing in
# which these patterns read "Body begins" / "Text ends"; the Chinese markers
# below are the presumed originals -- confirm against the live page markup.
_BODY_START_RE = re.compile(r'<.+正文开始.+?>')
_BODY_END_RE = re.compile(r'<.+正文结束.+?>')

# Article links on the list page; the group captures the URL itself.
_ARTICLE_LINK_RE = re.compile(r'<a\s+title.+?href="(http.+?html)')

_BR_RE = re.compile(r'<br\s+?/>')

# A body line containing any of these substrings is treated as markup and
# dropped entirely.
_SKIP_TOKENS = ('div', 'span', '<p>')

# (entity or tag, replacement) pairs applied to every kept body line.
# NOTE(review): the recovered listing showed the entities with a stray space
# ("& #65292;") and showed the last pattern as a bare space; both look like
# extraction damage, so the entity forms are used here -- confirm.
_REPLACEMENTS = (
    ('&#65292;', ','),
    ('&#65306;', ':'),
    ('&#65281;', '!'),
    ('&#65288;', '('),
    ('&#65289;', ')'),
    ('&#8943;', '...'),
    ('&#65311;', '?'),
    ('&#65307;', ';'),
    ('<wbr>', ''),
    ('&nbsp;', ''),
)


def _to_text(raw):
    """Return *raw* as str, decoding bytes as UTF-8 (bad bytes are replaced)."""
    if isinstance(raw, bytes):
        return raw.decode('utf-8', 'replace')
    return raw


def _find_title(lines):
    """Consume *lines* up to the first <title> line and return its text, or None."""
    for line in lines:
        found = _TITLE_RE.search(line)
        if found:
            return found.group(1)
    return None


def _clean(line):
    """Replace the known entities/tags in *line* and strip <br /> tags."""
    for old, new in _REPLACEMENTS:
        line = line.replace(old, new)
    return _BR_RE.sub('', line)


def article_format(usock, basedir):
    """Write the article read from *usock* to a text file under *basedir*.

    The input is scanned line by line in three phases: find the <title>
    line, skip ahead to the body-start marker, then copy cleaned body lines
    until the body-end marker, after which "\\nend" is appended.

    Args:
        usock: iterable of lines (bytes or str), e.g. an open HTTP response.
        basedir: existing directory in which the file is created.

    Returns:
        The path of the file written, or None when no title line was found
        or the file could not be opened (the error is printed, not raised).
    """
    lines = (_to_text(raw) for raw in usock)

    title = _find_title(lines)
    if title is None:
        return None

    # A "/" in the title would otherwise be read as a directory separator.
    filename = os.path.join(basedir, title.replace('/', '_'))
    print(filename)
    try:
        fobj = open(filename, 'w', encoding='utf-8')
    except IOError as e:
        print("Open %s error:%s" % (filename, e))
        return None

    # The with-block closes the file even when the end marker never shows up.
    with fobj:
        fobj.write(title + '\n')

        # Discard everything up to and including the body-start marker.
        for line in lines:
            if _BODY_START_RE.search(line):
                break

        for line in lines:
            if _BODY_END_RE.search(line):
                fobj.write('\nend')
                break
            if any(token in line for token in _SKIP_TOKENS):
                continue
            fobj.write(_clean(line))
    return filename


def main():
    """Download every article linked from ARTICLE_LIST_URL into DEFAULT_BASEDIR."""
    basedir = DEFAULT_BASEDIR
    os.makedirs(basedir, exist_ok=True)

    with urllib.request.urlopen(ARTICLE_LIST_URL) as usock:
        context = usock.read().decode('utf-8', 'replace')

    for url in _ARTICLE_LINK_RE.findall(context):
        with urllib.request.urlopen(url) as article_usock:
            article_format(article_usock, basedir)


if __name__ == '__main__':
    main()
View Code
Download Sina blog post, save as text file (python)