Bulk-downloading Sina Blog articles with Python

Source: Internet
Author: User
This article describes a Python script for bulk-downloading articles from Sina Blog. It is shared here for your reference. The implementation is as follows:

# coding=utf-8 Import urllib2import sys, osimport reimport stringfrom beautifulsoup import beautifulsoupdef encode (s): RE Turn s.decode (' Utf-8 '). Encode (sys.stdout.encoding, ' ignore ') def gethtml (URL): #proxy_handler = Urllib2. Proxyhandler ({' http ': ' http://211.138.124.211:80 '}) #opener = Urllib2.build_opener (Proxy_handler) #urllib2. Install_ Opener (opener) req = Urllib2. Request (URL) response = Urllib2.urlopen (req, timeout=15) return BeautifulSoup (response, convertentities= beautifulsoup.html_entities) def visible (Element): "Grab visible text elements '" If element.parent.name in [' Style ', ' script ', ' [ Document] ', ' head ', ' title ': Return False elif re.match ('
 ', str (element)): return false elif element = = U ' \xa0 ': return false return Truedef Delreturn (Element): ' ' Deletes the element within the NewLine ' ' Return Re.sub (' (?
 "|\xa0", ",", filename) def writetofile (text, filename, dirname): If not os.path.exists (dirname): Os.makedirs (dirname) Print encode (' Save to Directory '), dirname filename = validfilename (filename) print encode (' Save article '), filename path = Os.path.join (dirname, filename) if not os.path.exists (path): F = open (path, ' W ') f.write (text) f.close () else:print fil ename, encode (' already exists ') def formatcontent (URL, title= '): ' Formatted article content ' ' page = gethtml (URL) content = page.find (' div ', {' C Lass ': ' Articalcontent '}) art_id = Re.search (' Blog_ (\w+) \.html ', URL). Group (1) blog_name = Page.find (' span ', id= ' Blognamespan '). String if title = = ": title = Page.find (' H2 ', id=re.compile (' ^t_ ')). String Temp_data = Filter (visible , Content.findall (text=true) # removes the invisible element temp_data = '. Join (Map (Delreturn, Temp_data)) # Remove line breaks within an element temp_data = Temp_data . Strip () # Delete article end-to-end blank line temp_data = Re.sub (' \n{2,} ', ' \ n ', temp_data) # Remove too many blank lines in the article # Output to File # encoding problem Temp_data = ' This address: '. D Ecode (' utf-8 ') + URL + ' \ n ' +Temp_data Op_text = Temp_data.encode (' utf-8 ') Op_file = title + ' _ ' + art_id + '. txt ' writetofile (op_text, op_file, blog   _NAME) def articlelist (URL): articles = {} page = gethtml (URL) pages = Page.find (' ul ', {' class ': ' Sg_pages '}). span.string  page_num = Int (Re.search (' (\d+) ', pages). Group (1)) for I in range (1, page_num+1): Print encode (' generate page%d article index '%i ') if  I! = 1:url = Re.sub (' (_) \d+ (\.html) $ ', ' \g<1> ' +str (i) + ' \g<2> ', url) page = gethtml (URL) article = Page.findall (' span ', {' class ': ' Atc_title '}) for art in article:art_title = art.a[' title '] Art_href = art.    
a[' href '] articles[art_title] = art_href return articlesdef blog_dld (articles): If not isinstance (articles, dict): return False print encode (' Start download article ') for Art_title, art_href in Articles.items (): FormatContent (Art_href, Art_title if __name__ = = ' __main__ ': sel = raw_input (' You want to download is (1) all articles or (2) a single article, enter 1 or 2: ')) if sel = = ' 1 ': #articlelist_ur L = ' http://blog.sina.com.cn/s/articlelist_1303481411_0_1.html ' Articlelist_url = raw_input (' Please enter the blog post directory Link: ') ' articles = Articlelist (    Articlelist_url) BLOG_DLD (articles) Else: #article_url = ' http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html ' Article_url = raw_input (Encode (' Please enter blog post link: ') formatcontent (Article_url)

Hopefully this article will help you with Python programming.

  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please write us an email, and we will handle the problem within 5 days of receiving your email.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.