python 爬取豆瓣圖書

來源:互聯網
上載者:User

標籤:fst   post   art   資料   ascii   comment   value   lis   爬蟲   

# -*- coding: utf-8 -*-
"""Scrape the books listed under a Douban tag and export them to an Excel file."""
import logging
from collections import OrderedDict

import requests
import xlwt
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def _text_of(tag):
    """Return the whitespace-stripped text of *tag*, or None when *tag* is None."""
    if tag is None:
        return None
    return tag.get_text(strip=True)


class DouBanBookSpider(object):
    """Crawl the listing pages of one Douban book tag.

    Args:
        book_type: Tag name inserted into the listing URL; also used as the
            worksheet name and as the stem of the output file name.
        quantity: Number of books to request; listing pages are fetched in
            steps of ``PAGE_SIZE``.

    Attributes:
        url_list: Listing-page URLs produced by the last ``get_url()`` call.
        book_dict: Ordered mapping ``str(sequence_number) -> record dict``.
        count: Number of records stored so far (the last sequence number).
    """

    # The listing is paged 20 books at a time (the ``start`` offset moves by 20).
    PAGE_SIZE = 20
    # Seconds to wait for a response, so a stalled connection cannot hang the crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self, book_type, quantity):
        self.book_type = book_type
        self.quantity = quantity
        self.url_list = []
        self.book_dict = OrderedDict()
        self.count = 0

    def get_url(self):
        """Build and return the listing-page URLs covering ``quantity`` books.

        The list is rebuilt on every call, so calling this method repeatedly
        does not accumulate duplicate URLs.
        """
        # range() stops before ``quantity``: asking for 2000 books yields the
        # offsets 0..1980 (100 pages) instead of an extra 101st page.
        self.url_list = [
            'https://book.douban.com/tag/%s?start=%d&type=S' % (self.book_type, start)
            for start in range(0, self.quantity, self.PAGE_SIZE)
        ]
        return self.url_list

    def main_spider(self, url):
        """Fetch one listing page and add every book found on it to ``book_dict``.

        A page that cannot be fetched is logged and skipped, so the records
        collected from earlier pages are kept.
        """
        try:
            rsp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            rsp.raise_for_status()
        except requests.RequestException as exc:
            logger.warning("skipping %s: %s", url, exc)
            return

        page = BeautifulSoup(rsp.text, 'lxml')
        # Tag objects are searchable themselves; no need to re-parse each item.
        for item in page.find_all('li', class_='subject-item'):
            self._record_book(item)

    def _record_book(self, item):
        """Extract one ``li.subject-item`` element into a ``book_dict`` record."""
        title_link = item.h2.a if item.h2 is not None else None
        if title_link is None:
            # Without a title there is nothing useful to store for this item.
            return

        rating_box = item.find('div', class_='star clearfix')
        if rating_box is None:
            grade = None
            comment_count = None
        else:
            grade = _text_of(rating_box.find('span', class_='rating_nums'))
            comment_count = _text_of(rating_box.find('span', class_='pl'))

        self.count += 1
        # NOTE: the last key is named "author" but holds the text of the
        # ``div.pub`` element; write_excel() labels that column as
        # publication info. The key is kept unchanged for existing readers
        # of ``book_dict``.
        self.book_dict[str(self.count)] = {
            '序號': self.count,
            '書名': _text_of(title_link),
            '評分': grade,
            '評論數': comment_count,
            '作者': _text_of(item.find('div', class_='pub')),
        }

    def do_spider(self):
        """Crawl every URL returned by ``get_url()``."""
        for url in self.get_url():
            self.main_spider(url)

    def write_excel(self):
        """Write ``book_dict`` to ``<book_type>.xls`` in the working directory."""
        # The labels are non-ASCII; utf-8 lets xlwt decode them even when they
        # arrive as byte strings.
        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet(self.book_type)

        # Bold header style.
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        style.font = font

        header = ['序號', '書名', '評分', '評論數', '出版資訊']
        for col, title in enumerate(header):
            ws.write(0, col, title, style)

        # Keys are the 1-based sequence numbers, so they double as row indexes
        # (row 0 is the header).
        for key, record in self.book_dict.items():
            for col, value in enumerate(record.values()):
                ws.write(int(key), col, value)

        # xlwt produces the legacy binary .xls format; an .xlsx extension
        # would mislabel the file contents.
        wb.save('%s.xls' % self.book_type)


if __name__ == "__main__":
    ds = DouBanBookSpider('中國歷史', 2000)
    ds.do_spider()
    ds.write_excel()

 

python 爬取豆瓣圖書

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.