# coding=utf-8
import urllib2
import sys, os
import re
import string
from BeautifulSoup import BeautifulSoup


def encode(s):
    # Re-encode a UTF-8 string for the console, dropping unmappable characters
    return s.decode('utf-8').encode(sys.stdout.encoding, 'ignore')


def getHTML(url):
    # proxy_handler = urllib2.ProxyHandler({'http': 'http://211.138.124.211:80'})
    # opener = urllib2.build_opener(proxy_handler)
    # urllib2.install_opener(opener)
    req = urllib2.Request(url)
    response = urllib2.urlopen(req, timeout=15)
    return BeautifulSoup(response, convertEntities=BeautifulSoup.HTML_ENTITIES)


def visible(element):
    '''Keep only visible text elements'''
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element)):
        return False
    elif element == u'\xa0':
        return False
    return True


def delReturn(element):
    '''Delete line breaks inside an element'''
    return re.sub('(?<!^)\n+(?!$)', '', str(element)).decode('utf-8')


def validFilename(filename):
    # Strip characters that are illegal in Windows file names
    return re.sub('[\/:*?<>"|\xa0]', '', filename)


def writeToFile(text, filename, dirname):
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    print encode('Save to directory'), dirname
    filename = validFilename(filename)
    print encode('Saving file'), filename
    path = os.path.join(dirname, filename)
    if not os.path.exists(path):
        f = open(path, 'w')
        f.write(text)
        f.close()
    else:
        print filename, encode('already exists')


def formatContent(url, title=''):
    '''Format the article content'''
    page = getHTML(url)
    content = page.find('div', {'class': 'articalContent'})
    art_id = re.search('blog_(\w+)\.html', url).group(1)
    blog_name = page.find('span', id='blognamespan').string
    if title == '':
        title = page.find('h2', id=re.compile('^t_')).string
    temp_data = filter(visible, content.findAll(text=True))  # remove invisible elements
    temp_data = ''.join(map(delReturn, temp_data))  # delete line breaks inside elements
    temp_data = temp_data.strip()  # strip blank lines at the beginning and end
    temp_data = re.sub('\n{2,}', '\n\n', temp_data)  # collapse excess blank lines
    # Output to file (mind the encoding)
    temp_data = 'Article URL: '.decode('utf-8') + url + '\n\n' + temp_data
    op_text = temp_data.encode('utf-8')
    op_file = title + '_' + art_id + '.txt'
    writeToFile(op_text, op_file, blog_name)


def articlelist(url):
    articles = {}
    page = getHTML(url)
    pages = page.find('ul', {'class': 'SG_page'}).span.string
    page_num = int(re.search('(\d+)', pages).group(1))
    for i in range(1, page_num + 1):
        print encode('Generating the index for page %d' % i)
        if i != 1:
            url = re.sub('(_)\d+(\.html)$', '\g<1>' + str(i) + '\g<2>', url)
            page = getHTML(url)
        article = page.findAll('span', {'class': 'atc_title'})
        for art in article:
            art_title = art.a['title']
            art_href = art.a['href']
            articles[art_title] = art_href
    return articles


def blog_dld(articles):
    if not isinstance(articles, dict):
        return False
    print encode('Starting download')
    for art_title, art_href in articles.items():
        formatContent(art_href, art_title)


if __name__ == '__main__':
    sel = raw_input(encode('Download (1) all articles or (2) a single article? Enter 1 or 2: '))
    if sel == '1':
        # articlelist_url = 'http://blog.sina.com.cn/s/articlelist_1303481411_0_1.html'
        articlelist_url = raw_input(encode('Enter the blog article list URL: '))
        articles = articlelist(articlelist_url)
        blog_dld(articles)
    else:
        # article_url = 'http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html'
        article_url = raw_input(encode('Enter the blog post URL: '))
        formatContent(article_url)
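For reference, a minimal usage sketch. The script is Python 2 and needs BeautifulSoup 3 (the `from BeautifulSoup import BeautifulSoup` import, not `bs4`). The file name sina_blog_dld.py is assumed here for illustration, and the sample URL is the one left commented out in the script itself:

    $ python sina_blog_dld.py
    Download (1) all articles or (2) a single article? Enter 1 or 2: 2
    Enter the blog post URL: http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html

Each article is saved as a .txt file (title plus the article id from the URL) in a directory named after the blog; files that already exist are skipped, so the download can be re-run to resume.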