Crawling Douban books with Python
# -*- coding: utf-8 -*-
import requests
import xlwt
from bs4 import BeautifulSoup
from collections import OrderedDict


class DouBanBookSpider(object):
    def __init__(self, book_type, quantity):
        self.book_type = book_type      # Douban book tag, e.g. '中國歷史'
        self.quantity = quantity        # roughly how many books to fetch
        self.url_list = []
        self.book_dict = OrderedDict()
        self.count = 0

    # Build the list of paginated tag URLs
    def get_url(self):
        count = 0
        while count < self.quantity + 1:
            url = 'https://book.douban.com/tag/%s?start=%d&type=S' % (self.book_type, count)
            self.url_list.append(url)
            count += 20  # each page lists 20 books
        return self.url_list

    # Core crawler: parse one listing page
    def main_spider(self, url):
        # Douban tends to reject the default requests User-Agent, so send a browser-like one
        headers = {'User-Agent': 'Mozilla/5.0'}
        rsp = requests.get(url, headers=headers)
        tag_bf = BeautifulSoup(rsp.text, 'lxml')
        content = tag_bf.find_all('li', class_='subject-item')
        if not content:
            return
        for item in content:
            self.count += 1
            book_name = item.h2.a.get_text(strip=True)
            # the 'pub' div holds author / publisher / date information
            pub_info = item.find('div', class_='pub').get_text(strip=True)
            comment_info = item.find('div', class_='star clearfix')
            grade = comment_info.find('span', class_='rating_nums')
            if grade:
                grade = grade.string
            comment_count = comment_info.find('span', class_='pl').get_text(strip=True)
            self.book_dict[str(self.count)] = {
                '序號': self.count,
                '書名': book_name,
                '評分': grade,
                '評論數': comment_count,
                '出版資訊': pub_info,
            }

    # Run the crawler over every page URL
    def do_spider(self):
        for url in self.get_url():
            self.main_spider(url)

    # Write the collected data to an Excel file
    def write_excel(self):
        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet(self.book_type)
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        style.font = font
        row0 = ['序號', '書名', '評分', '評論數', '出版資訊']
        for i in range(len(row0)):
            ws.write(0, i, row0[i], style)
        for k, v in self.book_dict.items():
            for j, value in enumerate(v.values()):
                ws.write(int(k), j, value)
        # xlwt can only produce the legacy .xls format, so use that extension
        wb.save('%s.xls' % self.book_type)


if __name__ == "__main__":
    ds = DouBanBookSpider('中國歷史', 2000)
    ds.do_spider()
    ds.write_excel()
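Note that xlwt only produces the legacy .xls format. If a genuine .xlsx workbook is wanted, one common option is openpyxl. Below is a minimal sketch of a replacement writer using openpyxl; it assumes the same book_dict layout produced by the spider above, and the helper name write_xlsx is hypothetical rather than part of the original post.

# Minimal sketch: write the scraped books to a real .xlsx file with openpyxl.
# Assumes book_dict maps row numbers to OrderedDicts in the column order below.
from openpyxl import Workbook
from openpyxl.styles import Font

def write_xlsx(book_type, book_dict):
    wb = Workbook()
    ws = wb.active
    ws.title = book_type                              # one sheet named after the book tag
    headers = ['序號', '書名', '評分', '評論數', '出版資訊']
    ws.append(headers)
    header_font = Font(name='Times New Roman', bold=True)
    for cell in ws[1]:                                # style the header row
        cell.font = header_font
    for record in book_dict.values():                 # one row per scraped book
        ws.append(list(record.values()))
    wb.save('%s.xlsx' % book_type)                    # openpyxl writes a real .xlsx file

With such a helper, write_excel could simply delegate to write_xlsx(self.book_type, self.book_dict) and the xlwt dependency could be dropped.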