Original post: Using Python to crawl all posts on the cnblogs (www.cnblogs.com) home page, and periodically crawl newly published content into MongoDB
Dependencies: 1. jieba  2. pymongo  3. HTMLParser (part of the Python 2 standard library)

# -*- coding: utf-8 -*-
"""
@author: Jiangfuqiang
"""
from HTMLParser import HTMLParser
import re
import time
from datetime import date
import pymongo
import urllib2
import sys
import traceback
import jieba

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

isExist = False   # becomes True once we reach posts that were already stored


class FetchCnblog(HTMLParser):
    """Parses one cnblogs list page and collects posts newer than `id`."""

    def __init__(self, id):
        HTMLParser.__init__(self)
        self.result = []
        self.data = {}
        self.isTitleLink = False
        self.id = id
        self.isSummary = False
        self.isPostItem = False
        self.isArticleView = False

    def handle_data(self, data):
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        global isExist   # fix: without this, the assignment below only set a local
        if tag == 'a':
            for key, value in attrs:
                if key == 'class':
                    if value == 'titlelnk':
                        self.isTitleLink = True
                    elif value == 'gray' and self.isArticleView:
                        self.isArticleView = False
                        for key, value in attrs:
                            if key == 'href':
                                self.data['readmoreLink'] = value
                                reg = r'\d+'   # extract the numeric post id from the href
                                result = re.search(reg, value)
                                self.isPostItem = False
                                if result:
                                    self.data['id'] = int(result.group())
                                else:
                                    self.data = {}
                                    return
                                if self.data['id'] <= self.id:
                                    # reached posts already crawled: stop paging
                                    self.data = {}
                                    isExist = True
                                    return
                                else:
                                    self.data['srouce'] = "www.cnblogs.com"
                                    self.data['source_key'] = 'cnblogs'
                                    self.data['fetchTime'] = str(date.today())
                                    self.data['keyword'] = ",".join(jieba.cut(self.data['title']))
                                    self.result.append(self.data)
                                    self.data = {}
        elif tag == 'p':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_summary':
                    self.isSummary = True
        elif tag == 'img':
            for key, value in attrs:
                if key == 'class' and value == 'pfs':
                    for key, value in attrs:
                        if key == 'src':
                            self.data['imgSrc'] = value
        elif tag == 'div':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_foot':
                    self.isSummary = False
                elif key == 'class' and value == 'post_item':
                    self.isPostItem = True
        elif tag == 'span':
            for key, value in attrs:
                if key == 'class' and value == 'article_view':
                    self.isArticleView = True

    def getResult(self):
        return self.result


if __name__ == "__main__":
    con = pymongo.Connection('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record
    url = "http://www.cnblogs.com/sitehome/p/%d"
    count = 1
    flag = False
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; '
                             'en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    reco = record.find_one({"type": 'cnblogs'})
    id = 0
    if reco:
        id = reco['maxId']
    while isExist == False:
        try:
            req = urllib2.Request(url % count, headers=headers)
            request = urllib2.urlopen(req)
            data = request.read()
            fj = FetchCnblog(id)
            fj.feed(data)
            result = fj.getResult()
            if len(result) < 1:
                isExist = True
            else:
                if flag == False:
                    # remember the newest post id so the next run knows where to stop
                    flag = True
                    dic = result[0]
                    id = int(dic['id'])
                    record.update({"type": 'cnblogs'}, {"$set": {'maxId': id}}, True, False)
                result.reverse()   # insert oldest first, so ids stay in order
                for doc in result:
                    fetchblog.insert(doc)
                print "page is %d" % count
                count += 1
                time.sleep(5)   # be polite between page requests
        except Exception, e:
            traceback.print_exc()
            print "parse error", e

If the program runs on Linux or Mac, you can set it up as a recurring job with crontab -e; if it runs on Windows, add a timer inside the program itself.
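For concreteness, here is a minimal sketch of both scheduling options. Everything in it is illustrative: the cron schedule and script path are assumptions, and fetch_once() is a hypothetical wrapper you would create by moving the crawl loop from the __main__ block above into a function.

# On Linux/Mac, a crontab -e entry along these lines would re-run the script
# every six hours (path and schedule are assumed, not from the original post):
#     0 */6 * * * /usr/bin/python /path/to/fetch_cnblogs.py
#
# On Windows, where cron is unavailable, a simple in-process timer loop works:

import time
import traceback

INTERVAL_SECONDS = 6 * 60 * 60   # example interval: six hours, chosen arbitrarily


def fetch_once():
    # Placeholder: move the body of the __main__ block above into this function.
    pass


def run_forever():
    while True:
        try:
            fetch_once()
        except Exception:
            traceback.print_exc()     # log the failure but keep the timer alive
        time.sleep(INTERVAL_SECONDS)


if __name__ == "__main__":
    run_forever()

Between the two, cron is usually the safer choice: if one run crashes, the scheduler itself survives, whereas the in-process loop dies with the program unless every exception is caught as above.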