CSDN blocks plain automated crawlers, so an ordinary crawler cannot fetch articles from it. This script therefore simulates a real browser (cookie jar, randomized User-Agent, Google referer) to gain access.
Usage: enter the URL of a CSDN article; the script extracts the article body, wraps it as standalone HTML, and saves it to a file named after the article title.
#! /Usr/bin/env python # coding = UTF-8 ############################# #############> File Name: CSDN_article.py #> Author: nealgavin #> Mail: nealgavin@126.com #> Created Time: tue 27 May 2014 03:42:54 pm cst ################################# ######## import randomimport socketimport urllib2import urllibimport reimport stringimport BeautifulSoupimport sysimport cookielibERROR = {'0 ': 'Can not open the url, checc K you net', '1': 'creat download dir error', '2': 'image links is empty', '3': 'Download faild ', '4': 'build soup error, the html is empty ', '5': 'Can not save the image to your disk',} class BrowserBase (object ): "simulate Browser" def _ init _ (self): socket. setdefatimetimeout (20) self. HTML = ''self. articleName = ''self. link = ''def speak (self, name, content): print '[% s] % s' % (name, content) def openurl (sel F, url): "Open webpage" cookie_support = urllib2.HTTPCookieProcessor (cookielib. cookieJar () self. opener = urllib2.build _ opener (cookie_support, urllib2.HTTPHandler) urllib2.install _ opener (self. opener) user_agents = ['mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv: 1.8.1.11) Gecko/20071127 Firefox/2.0.0.11 ', 'Opera/9.25 (Windows NT 5.1; U; en) ', 'mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1 ;. Net clr 1.1.4322 ;. net clr 2.0.50727) ', 'mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', 'mozilla/5.0 (X11; U; linux i686; en-US; rv: 1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12 ', 'lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9 ', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) ubuntu/11.04 Chromium/16.0.912.77 C Hrome/16.0.912.77 Safari/535.7 "," Mozilla/5.0 (X11; Ubuntu; Linux i686; rv: 10.0) Gecko/20100101 Firefox/10.0 ",] agent = random. choice (user_agents) self. opener. addheaders = [("User-agent", agent), ("Accept", "*/*"), ('Referer', 'HTTP: // www.google.com ')] try: res = self. opener. open (url) self. HTML = res. 
read () # print res. read () failed t Exception, e: self. speak (str (e), url) raise Exception else: return res Def OUT (self): print self. HTML def getArticleName (self, tags): re_rules = r'http: // blog.csdn.net/nealgavin/article/details/ (. + ?) 'P = re. compile (re_rules, re. DOTALL) title = p. findall (str (tags) self. chineseListOut (title) def chineseListOut (self, tags): title = [] for tag in tags: for ele in tag: # print '+', str (ele ), '-'title. append (ele. strip () self. link = "http://blog.csdn.net" + title [1] tle = title [2]. split () self. articleName = '-'. join (tle) def buildArticleHTML (self): self. HTML = str (self. HTML) self. HTML ='
'+ Self. HTML self. HTML = self. HTML +''Def getMainArticle (self): "get the main article of CSDN blog" soup = BeautifulSoup. beautifulSoup (self. HTML) tags_all = soup. findAll ('div ', {'class': 'Article _ title'}) self. getArticleName (tags_all) tags_all = soup. findAll ('div ', {'id': 'Article _ content', 'class': 'Article _ content'}) self. HTML = tags_all [0] self. buildArticleHTML () def saveArticleAsHTML (self): filePath = self.articleName+'.html 'try: FilePointer = open (filePath, 'W + ') failed T: print 'open error' print 'path =', filePath filePointer. write (self. HTML) filePointer. close () browser = BrowserBase () url = raw_input ('input the links of CSDN article you needed! \ N') if url is None or len (url) = 0: url = "http://blog.csdn.net/nealgavin/article/details/27110717" browser. openurl (url) browser. getMainArticle () browser. saveArticleAsHTML ()