Python 抓取伯樂在線(Jobbole)的全部文章,對標題分詞後存入 MongoDB 中

來源:互聯網
上載者:User

標籤:des   blog   http   io   os   ar   for   sp   div   

依賴包:

1.pymongo

2.jieba


# -*- coding: utf-8 -*-

"""
@author: jiangfuqiang
"""
from HTMLParser import HTMLParser
import urllib2
import sys
import pymongo
import time
import jieba
import traceback

# Python 2 only: force the interpreter-wide default string encoding to UTF-8
# so implicit str/unicode conversions of the scraped text do not fail.
# On Python 3 the default encoding is already 'utf-8', so the branch below is
# never entered there.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # sys.setdefaultencoding is removed from the sys namespace at startup on
    # Python 2; reloading the module makes it available again.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
class FetchJobble(HTMLParser):
    """Event-driven parser that collects post records from a listing page.

    The parser is a small state machine driven by boolean flags.  A record
    starts at ``<div class="post-thumb">`` and is finished when the link
    inside ``<span class="read-more">`` is seen.  Each finished record is a
    dict that may contain the keys ``imgSrc``, ``title``, ``tag``,
    ``comment``, ``desc``, ``redmoreLink`` and ``keyword``; it is appended to
    ``self.result``.

    :param tokenizer: optional callable taking the title string and returning
        an iterable of tokens.  When omitted, ``jieba.cut`` is used.
    """

    def __init__(self, tokenizer=None):
        HTMLParser.__init__(self)
        self.isPostThumb = False
        self.isPostMeta = False  # never set to True by this class
        self.isMetaTitle = False
        self.isCategoryTag = False
        self.isComment = False
        self.isexcerpt = False
        self.isReadMore = False
        self.isPicture = False
        self._tokenizer = tokenizer
        self.data = {}      # record currently being assembled
        self.result = []    # finished records, in document order

    def _reset_flags(self):
        """Clear the per-record state flags once a record is complete."""
        self.isPostThumb = False
        self.isMetaTitle = False
        self.isReadMore = False
        self.isCategoryTag = False
        self.isComment = False
        self.isexcerpt = False
        self.isPicture = False

    def _finish_post(self, link):
        """Store the read-more link, tokenize the title and emit the record."""
        # NOTE: 'redmoreLink' (sic) is the key that ends up in stored
        # documents, so the spelling is kept as is.
        self.data['redmoreLink'] = link
        cut = self._tokenizer if self._tokenizer is not None else jieba.cut
        # A record without a captured title must not abort the whole page.
        self.data['keyword'] = ",".join(cut(self.data.get('title', '')))
        self.result.append(self.data)
        self._reset_flags()
        self.data = {}

    def _start_div(self, attrs):
        """Recognise the divs that open a record or announce its title."""
        for key, value in attrs:
            if key != 'class':
                continue
            if value == 'post-thumb':
                self.isPostThumb = True
            elif value == 'meta-title':
                self.isMetaTitle = True

    def _start_anchor(self, attrs):
        """Handle an ``<a>`` seen while a record is open."""
        for key, value in attrs:
            if self.isReadMore:
                if key == 'href':
                    self._finish_post(value)
                    # Stop here so the remaining attributes of this tag are
                    # not applied to the freshly reset state.
                    return
            elif key == 'class':
                if value == 'meta-title':
                    self.isMetaTitle = True
            elif key == 'rel':
                if value == 'category tag':
                    self.isCategoryTag = True
            elif key == 'href':
                # value is None for a valueless attribute such as <a href>.
                if value is not None and '#respond' in value:
                    self.isComment = True

    def _start_span(self, attrs):
        """Handle a ``<span>`` seen after the comment link."""
        for key, value in attrs:
            if key != 'class':
                continue
            if value == 'excerpt':
                self.isexcerpt = True
            elif value == 'read-more':
                self.isReadMore = True

    def _start_img(self, attrs):
        """Record the thumbnail URL of the open record."""
        for key, value in attrs:
            if key == 'src':
                self.data['imgSrc'] = value

    def handle_starttag(self, tag, attrs):
        """Dispatch a start tag to the handler matching the current state."""
        if tag == 'div':
            self._start_div(attrs)
        elif tag == 'a' and self.isPostThumb:
            self._start_anchor(attrs)
        elif tag == 'span' and self.isComment:
            self._start_span(attrs)
        elif tag == 'img' and self.isPostThumb and not self.isPostMeta:
            self._start_img(attrs)

    def handle_endtag(self, tag):
        """End tags carry no information for this parser."""
        pass

    def handle_data(self, data):
        """Assign a text node to the field selected by the active flag."""
        if self.isMetaTitle:
            self.data['title'] = data
            self.isMetaTitle = False
        elif self.isCategoryTag:
            # Several category links are joined into one comma-separated
            # string.
            if 'tag' in self.data:
                self.data['tag'] = self.data['tag'] + "," + data
            else:
                self.data['tag'] = data
            self.isCategoryTag = False
        elif self.isComment and 'comment' not in self.data:
            # Keep only the first space-separated token of the link text.
            self.data['comment'] = data.split(" ")[0]
        elif self.isexcerpt:
            self.data['desc'] = data
            self.isexcerpt = False

    def getResult(self):
        """Return the list of records finished so far."""
        return self.result

def main():
    """Crawl the paginated post listing and store each post in MongoDB.

    Pages are requested in order starting from 1.  Every page is parsed with
    FetchJobble and the resulting records are inserted into the
    ``fetch_blog`` collection of the ``blog`` database on localhost:27017.
    The crawl ends when a page yields no records, or after several
    consecutive failures on the same page.
    """
    client = pymongo.MongoClient('localhost', 27017)
    fetchblog = client.blog.fetch_blog

    url = "http://blog.jobbole.com/all-posts/page/%d"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    delay = 5          # seconds to wait between requests
    max_failures = 3   # consecutive errors tolerated before giving up

    page = 1
    failures = 0
    while True:
        try:
            req = urllib2.Request(url % page, headers=headers)
            response = urllib2.urlopen(req)
            try:
                html = response.read()
            finally:
                response.close()

            parser = FetchJobble()
            parser.feed(html)
            posts = parser.getResult()
            if not posts:
                # An empty page means we ran past the last page.
                break

            fetchblog.insert_many(posts)
            print("page is %d" % page)
            page += 1
            failures = 0
        except Exception as e:
            traceback.print_exc()
            print("parse error %s" % e)
            # The page number is not advanced on error, so bound the retries
            # instead of hammering the same URL forever.
            failures += 1
            if failures >= max_failures:
                print("giving up on page %d after %d failures" % (page, failures))
                break
        time.sleep(delay)


if __name__ == "__main__":
    main()

Python 抓取伯樂在線(Jobbole)的全部文章,對標題分詞後存入 MongoDB 中

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關。如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件後我們將在 5 個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.