Grab the top three questions, and the first answer to each, from Zhihu's "hottest today" and "hottest this month" lists, and save them to an HTML file whose name is the date followed by a fixed suffix, e.g. 20160228_zhihu_today_hot.html (date + zhihu_today_hot.html).
The code is as follows:
import html
import time
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By


def _to_gbk(text):
    """Encode *text* as GBK bytes for the HTML report.

    Characters that GBK cannot represent are emitted as HTML numeric
    character references instead of raising UnicodeEncodeError, which is
    safe because the output is always written into an HTML document.
    """
    return text.encode('gbk', errors='xmlcharrefreplace')


class Zhihu():
    """Scrape hot questions from Zhihu's explore page and save them as HTML.

    Constructing an instance opens a Chrome window and immediately scrapes
    both lists; the results are kept in ``today_hot_list`` and
    ``month_hot_list`` as lists of ``(question_title, answer_inner_html)``
    tuples. Call ``quit()`` when done to close the browser.
    """

    EXPLORE_URL = 'https://www.zhihu.com/explore'
    MONTHLY_URL = 'https://www.zhihu.com/explore#monthly-hot'
    QUESTION_LINK_SELECTOR = 'div.explore-feed.feed-item>h2>a.question_link'

    def __init__(self):
        self.dr = webdriver.Chrome()
        try:
            self.dr.maximize_window()
            self.today_hot_list = self.get_today_hot()
            self.month_hot_list = self.get_month_hot()
        except Exception:
            # The caller never receives the instance if scraping fails, so
            # it could not call quit() itself; close the browser here.
            self.dr.quit()
            raise

    def _collect_hot(self, listing_url, first_index, answer_selector,
                     answer_wait, count=3):
        """Return ``(title, first_answer_inner_html)`` for a run of questions.

        listing_url     -- explore page to load.
        first_index     -- index of the first wanted link among all elements
                           matching QUESTION_LINK_SELECTOR.
        answer_selector -- CSS selector of the answer body on a question page.
        answer_wait     -- seconds to sleep after opening a question page.
        count           -- number of questions to collect.

        If the page exposes fewer matching links than requested, only the
        available ones are returned.
        """
        self.dr.get(listing_url)
        sleep(3)  # give the listing time to render
        links = self.dr.find_elements(By.CSS_SELECTOR,
                                      self.QUESTION_LINK_SELECTOR)
        # Read title and URL up front: element handles from this page become
        # stale as soon as the driver navigates to a question page.
        targets = [
            (link.text, link.get_attribute('href'))
            for link in links[first_index:first_index + count]
        ]

        entries = []
        for title, question_url in targets:
            self.dr.get(question_url)
            sleep(answer_wait)
            # find_element returns the first match, i.e. the first answer.
            answer = self.dr.find_element(By.CSS_SELECTOR, answer_selector)
            entries.append((title, answer.get_attribute('innerHTML')))
        return entries

    def get_today_hot(self):
        """Return the top 3 of today's hottest questions with their first answer."""
        return self._collect_hot(
            self.EXPLORE_URL,
            first_index=0,
            answer_selector='.zm-editable-content.clearfix',
            answer_wait=10,
        )

    def get_month_hot(self):
        """Return the top 3 of this month's hottest questions with their first answer."""
        return self._collect_hot(
            self.MONTHLY_URL,
            # 5 matching links precede the monthly-hot div on the page.
            first_index=5,
            answer_selector='.zm-editable-content',
            answer_wait=5,
        )

    def _write_report(self, suffix, separator, question_label, answer_label,
                      entries):
        """Write *entries* to ``<date><suffix>.html`` in the working directory.

        Each entry is written as a separator line, the question title and the
        first answer, each followed by ``<br />`` (the HTML line break).
        """
        file_date = time.strftime('%Y-%m-%d')
        with open(file_date + suffix + '.html', 'wb') as report:
            # Kept as an attribute because the original code exposed it.
            self.file = report
            for title, answer_html in entries:
                report.write(_to_gbk(separator))
                # The title is plain text scraped from the web: escape it.
                # The answer is already HTML (innerHTML) and is written as-is.
                report.write(
                    _to_gbk(question_label + html.escape(title) + '<br />'))
                report.write(_to_gbk(answer_label + answer_html + '<br />'))

    def write_today_data(self):
        """Save today's hottest questions to ``<date>_zhihu_today_hot.html``."""
        self._write_report(
            '_zhihu_today_hot',
            '**********************************************<br />',
            'Question: ',
            'First answer: ',
            self.today_hot_list,
        )

    def write_month_data(self):
        """Save this month's hottest questions to ``<date>_zhihu_mouth_hot.html``."""
        # NOTE: "mouth" is a typo for "month" in the original file name; it is
        # kept so that existing consumers of the output file keep working.
        self._write_report(
            '_zhihu_mouth_hot',
            '--------------------------------------<br />',
            'question: ',
            'The first answer: ',
            self.month_hot_list,
        )

    def quit(self):
        """Close the browser and end the WebDriver session."""
        self.dr.quit()


if __name__ == '__main__':
    zhihu = Zhihu()
    try:
        zhihu.write_today_data()
        zhihu.write_month_data()
    finally:
        # Always release the browser, even if writing a report fails.
        zhihu.quit()
The page is as follows:
<img onload="if(this.width>650) this.width=650;" src="http://images2015.cnblogs.com/blog/942023/201612/942023-20161223145305417-651441845.png" style="border:0px;font-family:verdana, Arial, Helvetica, sans-serif;font-size:14px;line-height:21px;white-space:normal;background-color:rgb(255,255,255);" />
<img onload="if(this.width>650) this.width=650;" src="http://images2015.cnblogs.com/blog/942023/201612/942023-20161223145330964-142482858.png" style="border:0px;font-family:verdana, Arial, Helvetica, sans-serif;font-size:14px;line-height:21px;white-space:normal;background-color:rgb(255,255,255);" />
The generated HTML looks like this:
<img onload="if(this.width>650) this.width=650;" src="http://images2015.cnblogs.com/blog/942023/201612/942023-20161223145410948-1722519721.png" style="border:0px;font-family:verdana, Arial, Helvetica, sans-serif;font-size:14px;line-height:21px;white-space:normal;background-color:rgb(255,255,255);" />
<img onload="if(this.width>650) this.width=650;" src="http://images2015.cnblogs.com/blog/942023/201612/942023-20161223145429026-449547584.png" style="border:0px;font-family:verdana, Arial, Helvetica, sans-serif;font-size:14px;line-height:21px;white-space:normal;background-color:rgb(255,255,255);" />
Admittedly, the layout of the generated HTML is not very pretty.
This article comes from the "No idea, no achievement!" blog; please keep this attribution: http://kemixing.blog.51cto.com/10774787/1885534
Use Python + Selenium to grab the top three questions, and the first answer to each, from today's hottest and this month's hottest lists, and save them to an HTML file