Crawl the most popular books on Douban Books (http://book.douban.com/), sort them by rating, and save the result to a TXT file. For each book, the crawler needs to collect the title, author, rating, genre, and the one-sentence remark.
# coding=utf-8
"""Scrape the popular-books list from the Douban Books home page.

The books are sorted by rating (highest first) and written to a UTF-8 text
file, one block of lines per book.
"""
import re
from time import sleep


class DoubanPopularBook:
    """Collect the popular books shown on Douban Books and export them.

    Creating an instance launches Chrome through Selenium, loads the home
    page and stores the scraped, rating-sorted books in
    ``popular_books_list``.  Each book is the list of text lines of its
    ``<li>`` element.  The code reads those lines positionally as: name,
    score, a free-form detail line, genre, then either the remark or the
    ebook marker followed by the remark.
    NOTE(review): that layout is taken from the original listing -- confirm
    it against the live page.
    """

    URL = 'https://book.douban.com/'
    # Attribute selector combining the list's exact class value with its
    # <li> children, so every matched element is one book entry.
    LIST_SELECTOR = "[class='list-col list-col2 list-summary s']>li"
    MAX_BOOKS = 10
    # NOTE(review): this literal comes from a translated listing; the live
    # page is Chinese, so the marker probably has to be the page's own label.
    EBOOK_MARKER = 'have ebook'
    SEPARATOR_LINE = '~~~~~~~~~~~~~~~~~~~~~~~~\n'
    _RATING_RE = re.compile(r'\d+(?:\.\d+)?')

    def __init__(self):
        # Imported lazily so the pure helpers below stay importable (and
        # testable) on machines without Selenium or a Chrome driver.
        from selenium import webdriver

        self.dr = webdriver.Chrome()
        try:
            self.popular_books_list = self.get_douban_popular_books()
        except Exception:
            # Do not leave an orphaned browser behind when scraping fails.
            self.dr.quit()
            raise

    @classmethod
    def parse_rating(cls, rating_text):
        """Return the first number found in *rating_text* as a float.

        Text without any number yields ``float('-inf')`` so that such books
        sort after every rated book instead of aborting the whole run.
        """
        match = cls._RATING_RE.search(rating_text)
        return float(match.group()) if match else float('-inf')

    @classmethod
    def sort_by_rating(cls, books):
        """Return a new list of *books* ordered from highest to lowest rating.

        The rating is parsed from the second line of each book.  The whole
        number is used, not just its first two characters, so 8.9 ranks
        above 8.1.  Books with equal ratings keep their original order.
        """
        def rating_of(fields):
            if len(fields) > 1:
                return cls.parse_rating(fields[1])
            return float('-inf')

        return sorted(books, key=rating_of, reverse=True)

    @classmethod
    def format_book(cls, fields):
        """Return the report text for one book.

        *fields* is the list of text lines of one book entry.  Missing
        trailing lines are rendered as empty strings rather than raising.
        """
        padded = list(fields) + [''] * (6 - len(fields))
        name, score, detail, genre = padded[:4]
        # When the ebook marker occupies the fifth line, the remark is
        # pushed down to the sixth.
        remark = padded[5] if padded[4] == cls.EBOOK_MARKER else padded[4]
        return ''.join([
            cls.SEPARATOR_LINE,
            'Book Name: ' + name + '\n',
            'Score: ' + score + '\n',
            detail + '\n',
            'Genre: ' + genre + '\n',
            'A remark: ' + remark + '\n',
        ])

    def get_douban_popular_books(self):
        """Load the home page and return up to ``MAX_BOOKS`` books by rating.

        Returns:
            A list of books, each a list of the text lines of its entry,
            sorted from highest to lowest rating.
        """
        from selenium.webdriver.common.by import By

        self.dr.get(self.URL)
        sleep(3)  # give the page time to finish rendering
        # Query the page once and slice, instead of re-querying per index;
        # slicing also copes with fewer than MAX_BOOKS entries.
        entries = self.dr.find_elements(By.CSS_SELECTOR, self.LIST_SELECTOR)
        books = [entry.text.split('\n') for entry in entries[:self.MAX_BOOKS]]
        return self.sort_by_rating(books)

    def get_popular_books_rank_file(self):
        """Write ``popular_books_list`` to ``<file_title>.txt`` as UTF-8."""
        self.file_title = 'watercress list of top rated books'
        report = ''.join(
            self.format_book(fields) for fields in self.popular_books_list
        )
        # Encode once at the I/O boundary; the context manager closes the
        # file even if the write fails.
        with open(self.file_title + '.txt', 'wb') as out:
            out.write(report.encode('utf-8'))

    def quit(self):
        """Close the browser started by ``__init__``."""
        self.dr.quit()


# Spelling of the class name used by the original listing, kept as an alias.
Doubanpopularbook = DoubanPopularBook


if __name__ == '__main__':
    popular_books = DoubanPopularBook()
    try:
        popular_books.get_popular_books_rank_file()
    finally:
        popular_books.quit()
The page is as follows:
<img src="http://images2015.cnblogs.com/blog/942023/201612/942023-20161214221546354-484427625.png" alt="942023-20161214221546354-484427625.png" />
<img src="http://images2015.cnblogs.com/blog/942023/201612/942023-20161214221630354-618661845.png" alt="942023-20161214221630354-618661845.png" />
The resulting TXT effect is as follows:
<img src="https://attachments.tower.im/tower/4e81fb69a5814918bbf4f37fa507d33e/?filename=clipboard%20image.png" alt="?filename=clipboard%20image.png" />
<img src="https://attachments.tower.im/tower/6cd6b00927d6463fa4b30584380258d3/?filename=clipboard%20image.png" alt="?filename=clipboard%20image.png" />
This article comes from the blog "No idea, no achievement!". Please keep this attribution when reposting: http://kemixing.blog.51cto.com/10774787/1882855
Using Python + Selenium to crawl the most popular books on Douban Books, sorted by rating