# -*- coding: utf-8 -*-
"""Scrape the OSChina blog index and publish the posts as an RSS 2.0 file.

This is a Python 2 script: it relies on ``urllib2`` and on
``sys.setdefaultencoding``.
"""
import datetime
import os
import re
import sys
import time
import urllib2
from email.utils import formatdate

import PyRSS2Gen
from bs4 import BeautifulSoup

# Python 2 only: ``setdefaultencoding`` is removed from ``sys`` at startup,
# so the module has to be reloaded before the default str/unicode codec can
# be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')


class RssSpider(object):
    """Collect blog posts from the index page and write them out as RSS."""

    # Publication timestamp as searched for in the post's stat block:
    # four digits, two digits, two digits, whitespace, two digits, two
    # digits, each pair separated by one non-whitespace character.
    _TIMESTAMP_RE = re.compile(r'\d{4}\S\d{2}\S\d{2}\s\d{2}\S\d{2}')

    def __init__(self):
        """Create an empty feed and set the source URL and output path."""
        now = datetime.datetime.now()
        self.myrss = PyRSS2Gen.RSS2(
            title='ossina',
            link='http://my.oschina.net',
            description=str(datetime.date.today()),
            pubDate=now,
            lastBuildDate=now,
            items=[],
        )
        self.xmlpath = r'/var/www/myrss/oschina.xml'
        self.baseurl = "http://www.oschina.net/blog"

    def useragent(self, url):
        """Fetch ``url`` with browser-like headers and return the raw body.

        Args:
            url: Address of the page to download.

        Returns:
            The response body exactly as read from the connection.
        """
        i_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/36.0.1985.125 Safari/537.36",
            "Referer": 'http://baidu.com/',
        }
        req = urllib2.Request(url, headers=i_headers)
        return urllib2.urlopen(req).read()

    def enterpage(self, url):
        """Download a single post and turn it into an RSS item.

        Args:
            url: Address of the blog post.

        Returns:
            A ``PyRSS2Gen.RSSItem`` whose title is the page ``<title>``,
            whose link is ``url``, whose description is the stringified
            content ``<div>`` and whose ``pubDate`` is the timestamp text
            found in the stat block, or today's date when none is found.
        """
        soup = BeautifulSoup(self.useragent(url))

        # NOTE(review): the class names 'blogstat' / 'blogcontent' were
        # recovered from a case-mangled copy of this script -- verify them
        # against the live page markup.
        stat_block = soup.find('div', {'class': 'blogstat'})
        stat_text = (
            str(stat_block).strip().replace('\n', '').decode('utf-8')
        )

        match = self._TIMESTAMP_RE.search(stat_text)
        # Fall back to today's date when the page shows no timestamp.
        timestr = match.group() if match else str(datetime.date.today())

        content = soup.find('div', {'class': 'blogcontent'})
        return PyRSS2Gen.RSSItem(
            title=soup.title.string,
            link=url,
            description=str(content),
            pubDate=timestr,
        )

    def getcontent(self):
        """Walk the blog index and append one RSS item per linked post.

        Every ``<li>`` inside the index container is inspected; entries
        without a ``<div>`` or without a link inside it are skipped. Each
        link is printed before the post is downloaded.
        """
        soup = BeautifulSoup(self.useragent(self.baseurl))

        # NOTE(review): the container id 'centblogs' was recovered from a
        # case-mangled copy of this script -- verify it against the live
        # page markup.
        container = soup.find('div', {'id': 'centblogs'})
        for li in container.findAll('li'):
            div = li.find('div')
            if div is None:
                continue
            alink = div.find('a')
            if alink is None:
                continue
            link = alink.get('href')
            print(link)
            self.myrss.items.append(self.enterpage(link))

    def SaveRssFile(self, filename):
        """Serialize the feed as UTF-8 XML and write it to ``self.xmlpath``.

        Args:
            filename: Accepted for interface compatibility but not used;
                the output always goes to ``self.xmlpath``.
        """
        finallxml = self.myrss.to_xml(encoding='utf-8')
        # The context manager closes the file even if the write fails.
        with open(self.xmlpath, 'w') as out:
            out.write(finallxml)


if __name__ == '__main__':
    spider = RssSpider()
    spider.getcontent()
    spider.SaveRssFile('oschina.xml')