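The official OSChina (OSC) RSS feed does not include the full text of each post, which is inconvenient, so here is a Python script that crawls the latest OSChina blog posts and generates a full-text RSS feed from them.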
# -*- coding: utf-8 -*-
"""Crawl the latest OSChina blog posts and write them out as a full-text RSS feed.

Requires the third-party packages ``beautifulsoup4`` (imported as ``bs4``) and
``PyRSS2Gen``. The listing originally targeted Python 2; this version runs on
both Python 2 and Python 3.
"""
import datetime
import os  # noqa: F401  (unused; retained from the original listing)
import re
import sys
import time  # noqa: F401  (unused; retained from the original listing)
from contextlib import closing
from email.utils import formatdate  # noqa: F401  (unused; retained)

try:  # Python 2
    import urllib2 as urlrequest
except ImportError:  # Python 3
    import urllib.request as urlrequest

import PyRSS2Gen
from bs4 import BeautifulSoup

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8, as
    # the original script did. Python 3 has no equivalent and does not need it.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# NOTE(review): the pasted listing lost the backslashes (and letter case) of
# this pattern. It is reconstructed to match timestamps shaped like
# "2014-08-01 12:30"; confirm against the real page markup.
_TIMESTAMP_RE = re.compile(r'(\d{4})\S(\d{2})\S(\d{2})\s(\d{2})\S(\d{2})')

# NOTE(review): the letter case of these HTML class/id names was altered by the
# paste, so they are matched case-insensitively instead of guessing the casing.
_BLOG_STAT_CLASS = re.compile(r'^blogstat$', re.IGNORECASE)
_BLOG_CONTENT_CLASS = re.compile(r'^blogcontent$', re.IGNORECASE)
_RECENT_BLOGS_ID = re.compile(r'^recentblogs$', re.IGNORECASE)

# Explicit parser so results do not depend on which parsers are installed.
_PARSER = 'html.parser'


def _markup(tag):
    """Return *tag* rendered as unicode markup, or '' when *tag* is None."""
    return tag.decode() if tag is not None else ''


def _parse_timestamp(text):
    """Return the first timestamp found in *text* as a datetime, else None."""
    match = _TIMESTAMP_RE.search(text)
    if match is None:
        return None
    try:
        return datetime.datetime(*(int(part) for part in match.groups()))
    except ValueError:
        # Digits matched the shape but are not a real date (e.g. month 13).
        return None


class Rssspider(object):
    """Scrape the OSChina blog index and build a full-text RSS 2.0 feed.

    Attributes:
        myrss: the ``PyRSS2Gen.RSS2`` feed that collected items are added to.
        xmlpath: path of the XML file written by :meth:`saverssfile`.
        baseurl: URL of the blog index page that lists recent posts.
    """

    def __init__(self):
        now = datetime.datetime.now()
        self.myrss = PyRSS2Gen.RSS2(
            title='Oschina',
            link='http://my.oschina.net',
            description=str(datetime.date.today()),
            pubDate=now,
            lastBuildDate=now,
            items=[],
        )
        self.xmlpath = r'/var/www/myrss/oschina.xml'
        self.baseurl = 'http://www.oschina.net/blog'

    def useragent(self, url):
        """Fetch *url* with browser-like headers and return the raw body."""
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
            ),
            'Referer': 'http://baidu.com/',
        }
        request = urlrequest.Request(url, headers=headers)
        # closing() releases the connection promptly on both Python 2 and 3.
        with closing(urlrequest.urlopen(request)) as response:
            return response.read()

    def enterpage(self, url):
        """Download one blog post and return it as a ``PyRSS2Gen.RSSItem``.

        The publication time is read from the post's statistics block; when no
        timestamp can be found the current time is used instead. The item
        description is the markup of the post's content block, or an empty
        string when that block is missing.
        """
        soup = BeautifulSoup(self.useragent(url), _PARSER)

        stat_block = soup.find('div', {'class': _BLOG_STAT_CLASS})
        stat_text = _markup(stat_block).strip().replace('\n', '')
        # PyRSS2Gen renders datetime values as RFC 822 dates (treating them as
        # GMT); a plain string would be emitted verbatim and break readers.
        published = _parse_timestamp(stat_text) or datetime.datetime.now()

        # Fall back to the URL when the page has no usable <title>.
        title = soup.title.string if soup.title is not None else None
        if title is None:
            title = url

        content_block = soup.find('div', {'class': _BLOG_CONTENT_CLASS})
        return PyRSS2Gen.RSSItem(
            title=title,
            link=url,
            description=_markup(content_block),
            pubDate=published,
        )

    def getcontent(self):
        """Collect every post linked from the blog index into ``self.myrss``."""
        soup = BeautifulSoup(self.useragent(self.baseurl), _PARSER)
        recent = soup.find('div', {'id': _RECENT_BLOGS_ID})
        if recent is None:
            # Layout changed or the request was blocked: nothing to collect.
            sys.stderr.write('recent-blogs container not found at %s\n'
                             % self.baseurl)
            return

        for entry in recent.find_all('li'):
            wrapper = entry.find('div')
            anchor = wrapper.find('a') if wrapper is not None else None
            link = anchor.get('href') if anchor is not None else None
            if not link:
                continue  # list entry without a usable post link
            print(link)
            self.myrss.items.append(self.enterpage(link))

    def saverssfile(self, filename):
        """Serialize the feed as UTF-8 XML to ``self.xmlpath``.

        Args:
            filename: unused. The original implementation ignored it and always
                wrote to ``self.xmlpath``; the parameter is kept so existing
                callers keep working.
        """
        xml_data = self.myrss.to_xml(encoding='utf-8')
        if not isinstance(xml_data, bytes):
            xml_data = xml_data.encode('utf-8')
        # Write bytes so the file really is UTF-8, matching the XML header.
        with open(self.xmlpath, 'wb') as handle:
            handle.write(xml_data)


if __name__ == '__main__':
    spider = Rssspider()
    spider.getcontent()
    spider.saverssfile('Oschina.xml')
That is the whole content of this article; I hope you find it useful.