糗事百科python爬蟲

來源:互聯網
上載者:User

標籤:style   index   www.   4.0   exce   htm   tor   self   user   

# -*- coding: utf-8 -*-
# Interactive console reader for the "hot" posts on qiushibaike.com.
# Runs on both Python 2 and Python 3.
import re
import time
import urllib
from contextlib import closing

try:  # Python 2 module and builtin names
    import thread
    import urllib2
    from urllib2 import Request, URLError, urlopen
    _read_line = raw_input
except ImportError:  # Python 3 equivalents
    import _thread as thread
    from urllib.error import URLError
    from urllib.request import Request, urlopen
    _read_line = input


class QSBK(object):
    """Fetch pages of posts and print them one at a time on Enter.

    Attributes:
        pageIndex: number of the next page to download (starts at 1).
        user_agent: User-Agent string sent with every request.
        header: HTTP headers dict built from ``user_agent``.
        store: queue of downloaded pages; each page is a list of
            ``[author, content]`` pairs.
        enable: True while the reader is running.
    """

    # Group 1 is the text inside the author's <h2>; group 2 is the text of
    # the first <span> after class="content", with surrounding whitespace
    # stripped. re.S lets '.' run across newlines.
    _STORY_PATTERN = re.compile(
        r'<div class="author.*?<a.*?<img.*?</a>.*?<a.*?<h2>(.*?)</h2>'
        r'.*?class="content.*?<span>\s*(.*?)\s*</span>',
        re.S,
    )

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.header = {'User-Agent': self.user_agent}
        self.store = []
        self.enable = False

    def getPage(self, pageIndex):
        """Download page ``pageIndex`` and return its HTML decoded as UTF-8.

        Returns None (after printing the failure reason) on URLError.
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        request = Request(url, headers=self.header)
        try:
            # closing() releases the connection on every path; the Python 2
            # response object is not itself a context manager.
            with closing(urlopen(request)) as response:
                return response.read().decode('utf-8')
        except URLError as e:
            # ``reason`` may be an exception object rather than a string, so
            # format it instead of concatenating.
            reason = getattr(e, 'reason', e)
            print(u'連結網路失敗%s' % (reason,))
            return None

    def getPageItem(self, pageIndex):
        """Return the posts on page ``pageIndex`` as ``[author, content]`` lists.

        Returns None when the page could not be downloaded, and an empty
        list when the page contains nothing the pattern recognises.
        """
        page = self.getPage(pageIndex)
        if page is None:
            print(u'頁面獲得失敗')
            return None
        return [[author, content]
                for author, content in self._STORY_PATTERN.findall(page)]

    def loadPage(self):
        """Prefetch the next page while running and fewer than two are queued."""
        if not self.enable or len(self.store) >= 2:
            return
        pageStories = self.getPageItem(self.pageIndex)
        # A failed (None) or empty page is not queued and pageIndex is left
        # unchanged, so the same page is retried on the next call.
        if pageStories:
            self.store.append(pageStories)
            self.pageIndex += 1

    def getOneStory(self, pageStories):
        """Print the posts of one page, waiting for a line of input before each.

        Entering ``Q`` (or reaching end of input) clears ``enable`` and returns.
        """
        for story in pageStories:
            try:
                command = _read_line()
            except EOFError:
                # Closed stdin: treat as a request to quit instead of crashing.
                command = 'Q'
            if command == 'Q':
                self.enable = False
                return
            # Top up the queue while the user is reading.
            self.loadPage()
            print(u'%s %s' % (story[0], story[1]))

    def start(self):
        """Run the interactive loop until the user quits or no page can be loaded."""
        print(u'正在讀取糗事百科的資料,按Q退出')
        self.enable = True
        while self.enable:
            if not self.store:
                self.loadPage()
            if not self.store:
                # Nothing could be loaded; stop rather than spin forever.
                self.enable = False
                break
            self.getOneStory(self.store.pop(0))


if __name__ == '__main__':
    spider = QSBK()
    spider.start()

 

糗事百科python爬蟲

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.