# Dependent packages:
#   1. pymongo
#   2. jieba
# -*- coding: utf-8 -*-
"""
@author: Jiangfuqiang
"""
import sys
import time
import traceback
import urllib2
from HTMLParser import HTMLParser

import jieba
import pymongo
# Force the interpreter-wide default string encoding to UTF-8 so that implicit
# str/unicode conversions of the scraped text do not fail on non-ASCII
# characters.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # Python 2 only: sys.setdefaultencoding is removed from the sys namespace
    # at interpreter start-up, and reloading the module brings it back.  On an
    # interpreter whose default is already UTF-8 this branch never runs.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
class Fetchjobble(HTMLParser):
    """Extract post summaries from one page of the blog's post listing.

    The parser is a flag-driven state machine: start tags switch flags on,
    and ``handle_data`` stores the next piece of text under the field the
    active flag stands for.  A post is complete when the link inside its
    "read-more" span is seen; the record is then appended to ``result`` and
    the state is cleared for the next post.

    Fields written into each record: ``imgsrc``, ``title``, ``tag``
    (comma-joined categories), ``comment`` (first word of the comment-link
    text), ``desc``, ``redmorelink`` and ``keyword`` (comma-joined jieba
    tokens of the title).

    NOTE(review): the class-attribute values matched below ('post-thumb',
    'meta-title', 'excerpt', 'read-more') are assumed to be lower-case in the
    page markup -- verify against the live page.
    """

    def __init__(self):
        # Explicit base-class call: Python 2's HTMLParser is an old-style
        # class, so super() cannot be used here.
        HTMLParser.__init__(self)
        self.ispostmeta = False  # never switched on in this class
        self.result = []         # finished post records, in page order
        self._reset_post_state()

    def _reset_post_state(self):
        """Clear the per-post flags and start an empty record."""
        self.ispostthumb = False
        self.ismetatitle = False
        self.isreadmore = False
        self.iscategorytag = False
        self.iscomment = False
        self.isexcerpt = False
        self.ispicture = False
        self.data = {}

    def _finish_post(self, href):
        """Complete the current record with its link and keywords, store it."""
        # The key spelling is kept as-is: it is the stored field name.
        self.data['redmorelink'] = href
        # .get(): a post whose title was never captured must not abort the
        # whole page with a KeyError; it simply gets an empty keyword list.
        self.data['keyword'] = ",".join(jieba.cut(self.data.get('title', '')))
        self.result.append(self.data)
        self._reset_post_state()

    def handle_starttag(self, tag, attrs):
        """Update the state flags for an opening tag.

        ``tag`` is the tag name and ``attrs`` a list of ``(name, value)``
        pairs; ``value`` may be ``None`` for an attribute written without
        a value.
        """
        if tag == 'div':
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'post-thumb':
                    self.ispostthumb = True
                elif value == 'meta-title':
                    self.ismetatitle = True
        elif tag == 'a' and self.ispostthumb:
            if self.isreadmore:
                for key, value in attrs:
                    if key == 'href':
                        self._finish_post(value)
                        # Stop here: the remaining attributes belong to the
                        # finished post and must not leak into the next one.
                        break
            else:
                for key, value in attrs:
                    if key == 'class':
                        if value == 'meta-title':
                            self.ismetatitle = True
                    elif key == 'rel':
                        if value == 'category tag':
                            self.iscategorytag = True
                    elif key == 'href':
                        # Membership test instead of find() > 0, so a match
                        # at index 0 counts and a None value does not raise.
                        if value and '#respond' in value:
                            self.iscomment = True
        elif tag == 'span' and self.iscomment:
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'excerpt':
                    self.isexcerpt = True
                elif value == 'read-more':
                    self.isreadmore = True
        elif tag == 'img' and self.ispostthumb and not self.ispostmeta:
            for key, value in attrs:
                if key == 'src':
                    self.data['imgsrc'] = value

    def handle_endtag(self, tag):
        """Closing tags carry no information for this parser."""
        pass

    def handle_data(self, data):
        """Store a text node under the field selected by the active flag."""
        if self.ismetatitle:
            self.data['title'] = data
            self.ismetatitle = False
        elif self.iscategorytag:
            # Several category links accumulate into one comma-joined string.
            if 'tag' in self.data:
                self.data['tag'] = self.data['tag'] + "," + data
            else:
                self.data['tag'] = data
            self.iscategorytag = False
        elif self.iscomment and 'comment' not in self.data:
            # Keep only the first space-separated word of the link text.
            self.data['comment'] = data.split(" ")[0]
        elif self.isexcerpt:
            self.data['desc'] = data
            self.isexcerpt = False

    def getresult(self):
        """Return the list of post records collected so far."""
        return self.result
if __name__ == "__main__":
    # Walk the paginated post listing from page 1 upwards, parse every page
    # with Fetchjobble and insert each extracted post into the
    # blog.fetch_blog collection.  The crawl ends at the first page that
    # yields no posts.
    #
    # NOTE(review): pymongo.Connection and Collection.insert are legacy
    # PyMongo APIs; newer releases provide MongoClient / insert_one instead.
    # Confirm against the installed PyMongo version.
    con = pymongo.Connection('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog

    url = "http://blog.jobbole.com/all-posts/page/%d"
    headers = {
        'User-agent': 'mozilla/5.0 (Windows; U; Windows NT 6.1; en-us; '
                      'rv:1.9.1.6) gecko/20091201 firefox/3.5.6',
    }
    delay_seconds = 5   # pause between requests
    max_failures = 5    # consecutive errors tolerated before giving up

    count = 1
    failures = 0
    done = False
    while not done:
        try:
            req = urllib2.Request(url % count, headers=headers)
            response = urllib2.urlopen(req)
            try:
                data = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()

            fj = Fetchjobble()
            fj.feed(data)
            result = fj.getresult()
            if not result:
                done = True
            else:
                for doc in result:
                    fetchblog.insert(doc)
                print("page is %d" % count)
                count += 1
                failures = 0
                time.sleep(delay_seconds)
        except Exception as e:
            traceback.print_exc()
            print("Parse error %s" % (e,))
            # The same page is retried, but only after a pause and only a
            # bounded number of times; otherwise a persistent failure would
            # hit the server in a tight, endless loop.
            failures += 1
            if failures >= max_failures:
                done = True
            else:
                time.sleep(delay_seconds)
# This Python script crawls every article listed on Jobbole (blog.jobbole.com),
# segments each title into keywords with jieba, and stores the result in MongoDB.