Original source: Using Python to crawl all posts from the cnblogs homepage, and to keep fetching newly published content into MongoDB on a schedule
Dependencies: 1. jieba  2. pymongo  3. HTMLParser (part of the Python 2 standard library)

# -*- coding: utf-8 -*-
"""
@author: jiangfuqiang
"""
from HTMLParser import HTMLParser
from datetime import date
import re
import time
import pymongo
import urllib2
import sys
import traceback
import jieba

# Force UTF-8 as the default encoding (Python 2 idiom).
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Set to True once we reach a post that has already been stored.
isExist = False

class FetchCnblog(HTMLParser):
    """Parses a cnblogs listing page and collects the posts newer than `id`."""

    def __init__(self, id):
        HTMLParser.__init__(self)
        self.result = []
        self.data = {}
        self.isTitleLink = False
        self.id = id
        self.isSummary = False
        self.isPostItem = False
        self.isArticleView = False

    def handle_data(self, data):
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        global isExist
        if tag == 'a':
            for key, value in attrs:
                if key == 'class':
                    if value == 'titlelnk':
                        self.isTitleLink = True
                    elif value == 'gray' and self.isArticleView:
                        self.isArticleView = False
                        for key, value in attrs:
                            if key == 'href':
                                self.data['readmoreLink'] = value
                                # The post id is the numeric part of the href.
                                result = re.search(r'\d+', value)
                                self.isPostItem = False
                                if result:
                                    self.data['id'] = int(result.group())
                                else:
                                    self.data = {}
                                    return
                                if self.data['id'] <= self.id:
                                    # Already stored on a previous run; tell the
                                    # main loop to stop.
                                    self.data = {}
                                    isExist = True
                                    return
                                else:
                                    self.data['source'] = "www.cnblogs.com"
                                    self.data['source_key'] = 'cnblogs'
                                    self.data['fetchTime'] = str(date.today())
                                    self.data['keyword'] = ",".join(jieba.cut(self.data['title']))
                                    self.result.append(self.data)
                                    self.data = {}
        elif tag == 'p':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_summary':
                    self.isSummary = True
        elif tag == 'img':
            for key, value in attrs:
                if key == 'class' and value == 'pfs':
                    for key, value in attrs:
                        if key == 'src':
                            self.data['imgSrc'] = value
        elif tag == 'div':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_foot':
                    self.isSummary = False
                elif key == 'class' and value == 'post_item':
                    self.isPostItem = True
        elif tag == 'span':
            for key, value in attrs:
                if key == 'class' and value == 'article_view':
                    self.isArticleView = True

    def getResult(self):
        return self.result

if __name__ == "__main__":
    con = pymongo.Connection('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record          # remembers the largest post id seen so far
    url = "http://www.cnblogs.com/sitehome/p/%d"
    count = 1
    flag = False
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                             'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    reco = record.find_one({"type": 'cnblogs'})
    id = 0
    if reco:
        id = reco['maxId']
    while not isExist:
        try:
            req = urllib2.Request(url % count, headers=headers)
            request = urllib2.urlopen(req)
            data = request.read()
            fj = FetchCnblog(id)
            fj.feed(data)
            result = fj.getResult()
            if len(result) < 1:
                isExist = True
            else:
                if not flag:
                    # The first post on page 1 carries the largest id; save it
                    # so the next run knows where to stop.
                    flag = True
                    dic = result[0]
                    id = int(dic['id'])
                    record.update({"type": 'cnblogs'},
                                  {"$set": {'maxId': id}}, upsert=True)
                result.reverse()   # insert oldest first
                for doc in result:
                    fetchblog.insert(doc)
                print "page is %d" % count
                count += 1
                time.sleep(5)
        except Exception, e:
            traceback.print_exc()
            print "parse error", e

The script is meant to run on Linux or Mac, where you can schedule it as a recurring job with crontab -e; if you run it on Windows, you can add an equivalent scheduled task.
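For the crontab route, a minimal sketch of an entry follows; the interpreter path, script location, interval, and log file are placeholders, not from the original post:

# Hypothetical crontab entry: run the crawler every 30 minutes and
# append its output to a log file. Adjust the paths for your machine.
*/30 * * * * /usr/bin/python /path/to/fetch_cnblogs.py >> /tmp/fetch_cnblogs.log 2>&1

Because the script stops as soon as it reaches a post id it has already stored, rerunning it frequently is cheap.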
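If crontab is not available, a simple alternative is a long-running loop in Python itself. This is a minimal sketch, assuming the fetch logic in the __main__ block above is factored into a run_crawl() function (a name introduced here for illustration, not from the original):

import time

def run_crawl():
    # Placeholder: in practice, move the while-loop from the __main__
    # block above into this function.
    pass

if __name__ == "__main__":
    while True:
        run_crawl()
        time.sleep(30 * 60)  # re-run every 30 minutes (interval is an assumption)

Note that if you factor the script this way, the global isExist flag would need to be reset to False at the start of each run, or the loop will only crawl once.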
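Finally, to spot-check what ended up in MongoDB, here is a quick read-back using the same legacy pymongo API and the collection and field names from the script above:

import pymongo

con = pymongo.Connection('localhost', 27017)  # same old-style API as the script
db = con.blog
# Show the five most recently published posts that were stored.
for doc in db.fetch_blog.find().sort('id', pymongo.DESCENDING).limit(5):
    print doc['id'], doc['title']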