# -*- coding: utf-8 -*-
"""Crawl book listings for one tag from book.douban.com and export them to Excel."""
import requests
import xlwt
from bs4 import BeautifulSoup
from collections import OrderedDict


class DoubanBookSpider(object):
    """Scrape Douban book pages for a single tag and write the results to a spreadsheet."""

    def __init__(self, book_type, quantity):
        # book_type: Douban tag to crawl (also used as the sheet/file name).
        # quantity: approximate number of books to fetch; pages hold 20 each.
        self.book_type = book_type
        self.quantity = quantity
        self.url_list = []
        # Insertion-ordered mapping: serial number (str) -> book record dict.
        self.book_dict = OrderedDict()
        self.count = 0

    def get_url(self):
        """Build and return the list of paginated tag URLs covering `quantity` books."""
        start = 0
        while start < self.quantity + 1:
            url = 'https://book.douban.com/tag/%s?start=%d&type=S' % (self.book_type, start)
            self.url_list.append(url)
            start += 20  # Douban shows 20 books per result page
        return self.url_list

    def main_spider(self, url):
        """Fetch one result page and record every book found on it.

        Returns early (recording nothing) when the page has no book entries,
        which happens once pagination runs past the last real page.
        """
        rsp = requests.get(url)
        tag_bf = BeautifulSoup(rsp.text, 'lxml')
        content = tag_bf.find_all('li', class_='subject-item')
        if not content:
            return
        for item in content:
            bt_bf = BeautifulSoup(str(item), 'lxml')
            self.count += 1
            book_name = bt_bf.h2.a.get_text(strip=True)
            # BUG FIX: the original called .string.strip() unconditionally, which
            # raises AttributeError when the node is missing or .string is None
            # (mixed child nodes). Fall back to an empty string instead.
            pub = bt_bf.find('div', class_='pub')
            author = pub.string.strip() if pub is not None and pub.string else ''
            comment_info = bt_bf.find('div', class_='star clearfix')
            co_bf = BeautifulSoup(str(comment_info), 'lxml')
            grade = co_bf.find('span', class_='rating_nums')
            if grade:
                grade = grade.string  # books with too few ratings have no score span
            pl = co_bf.find('span', class_='pl')
            comment_count = pl.string.strip() if pl is not None and pl.string else ''
            self.book_dict[str(self.count)] = {
                'serial number': self.count,
                'title': book_name,
                'ratings': grade,
                'number of comments': comment_count,
                'author': author,
            }

    def do_spider(self):
        """Run the crawl: fetch every page URL in sequence."""
        for url in self.get_url():
            self.main_spider(url)

    def write_excel(self):
        """Write all collected records to '<book_type>.xls'.

        Row 0 carries bold Times New Roman headers; each record occupies the
        row given by its serial number, columns in record-insertion order.
        """
        wb = xlwt.Workbook(encoding='ascii')
        ws = wb.add_sheet(self.book_type)
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        style.font = font
        row0 = ['serial number', 'title', 'ratings',
                'number of comments', 'publication information']
        for col, header in enumerate(row0):
            ws.write(0, col, header, style)
        for serial, record in self.book_dict.items():
            # BUG FIX: the original iterated range(len(v.values)) -- len() of a
            # bound method -- which raises TypeError on the first record.
            for col, value in enumerate(record.values()):
                ws.write(int(serial), col, value)
        # BUG FIX: xlwt emits the legacy BIFF (.xls) format; the original saved
        # with an .xlsx extension, producing a file Excel refuses to open.
        wb.save('%s.xls' % self.book_type)


if __name__ == "__main__":
    ds = DoubanBookSpider('Chinese History', 2000)
    ds.do_spider()
    ds.write_excel()
A Python crawler for Douban (douban.com) book listings.