This article describes a Python implementation of a tool for bulk-downloading Sina blog posts, shared here for your reference. The implementation is as follows:
# coding=utf-8 Import urllib2import sys, osimport reimport stringfrom beautifulsoup import beautifulsoupdef encode (s): RE Turn s.decode (' Utf-8 '). Encode (sys.stdout.encoding, ' ignore ') def gethtml (URL): #proxy_handler = Urllib2. Proxyhandler ({' http ': ' http://211.138.124.211:80 '}) #opener = Urllib2.build_opener (Proxy_handler) #urllib2. Install_ Opener (opener) req = Urllib2. Request (URL) response = Urllib2.urlopen (req, timeout=15) return BeautifulSoup (response, convertentities= beautifulsoup.html_entities) def visible (Element): "Grab visible text elements '" If element.parent.name in [' Style ', ' script ', ' [ Document] ', ' head ', ' title ': Return False elif re.match ('
', str (element)): return false elif element = = U ' \xa0 ': return false return Truedef Delreturn (Element): ' ' Deletes the element within the NewLine ' ' Return Re.sub (' (?
"|\xa0", ",", filename) def writetofile (text, filename, dirname): If not os.path.exists (dirname): Os.makedirs (dirname) Print encode (' Save to Directory '), dirname filename = validfilename (filename) print encode (' Save article '), filename path = Os.path.join (dirname, filename) if not os.path.exists (path): F = open (path, ' W ') f.write (text) f.close () else:print fil ename, encode (' already exists ') def formatcontent (URL, title= '): ' Formatted article content ' ' page = gethtml (URL) content = page.find (' div ', {' C Lass ': ' Articalcontent '}) art_id = Re.search (' Blog_ (\w+) \.html ', URL). Group (1) blog_name = Page.find (' span ', id= ' Blognamespan '). String if title = = ": title = Page.find (' H2 ', id=re.compile (' ^t_ ')). String Temp_data = Filter (visible , Content.findall (text=true) # removes the invisible element temp_data = '. Join (Map (Delreturn, Temp_data)) # Remove line breaks within an element temp_data = Temp_data . Strip () # Delete article end-to-end blank line temp_data = Re.sub (' \n{2,} ', ' \ n ', temp_data) # Remove too many blank lines in the article # Output to File # encoding problem Temp_data = ' This address: '. D Ecode (' utf-8 ') + URL + ' \ n ' +Temp_data Op_text = Temp_data.encode (' utf-8 ') Op_file = title + ' _ ' + art_id + '. txt ' writetofile (op_text, op_file, blog _NAME) def articlelist (URL): articles = {} page = gethtml (URL) pages = Page.find (' ul ', {' class ': ' Sg_pages '}). span.string page_num = Int (Re.search (' (\d+) ', pages). Group (1)) for I in range (1, page_num+1): Print encode (' generate page%d article index '%i ') if I! = 1:url = Re.sub (' (_) \d+ (\.html) $ ', ' \g<1> ' +str (i) + ' \g<2> ', url) page = gethtml (URL) article = Page.findall (' span ', {' class ': ' Atc_title '}) for art in article:art_title = art.a[' title '] Art_href = art. 
a[' href '] articles[art_title] = art_href return articlesdef blog_dld (articles): If not isinstance (articles, dict): return False print encode (' Start download article ') for Art_title, art_href in Articles.items (): FormatContent (Art_href, Art_title if __name__ = = ' __main__ ': sel = raw_input (' You want to download is (1) all articles or (2) a single article, enter 1 or 2: ')) if sel = = ' 1 ': #articlelist_ur L = ' http://blog.sina.com.cn/s/articlelist_1303481411_0_1.html ' Articlelist_url = raw_input (' Please enter the blog post directory Link: ') ' articles = Articlelist ( Articlelist_url) BLOG_DLD (articles) Else: #article_url = ' http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html ' Article_url = raw_input (Encode (' Please enter blog post link: ') formatcontent (Article_url)
Hopefully this article will be helpful to you in your Python programming.