"""Crawl the Zhihu 'explore' page and append each question, author and
answer to a local text file (explore.txt), one record per feed item."""
import requests
from pyquery import PyQuery as pq

URL = 'https://www.zhihu.com/explore'

# Browser-like User-Agent so the site serves the normal page to the script
# instead of rejecting it as a bot.
HEADERS = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Fetch the page source while simulating a browser via the headers above.
html = requests.get(URL, headers=HEADERS).text

# Initialize pyquery: put the HTML into the parsing library for parsing
# with CSS-selector syntax.
doc = pq(html)

# '.explore-feed.feed-item' selects elements carrying BOTH classes
# (the two class names joined without a space are an AND selector).
items = doc('.explore-feed.feed-item').items()

# Open the output file once with a context manager (no explicit close needed)
# instead of re-opening and closing it for every single item.
with open('explore.txt', 'a', encoding='utf-8') as file:
    # Iterate through the filtered feed items.
    for item in items:
        # The question title lives in an <h2> element.
        question = item.find('h2').text()
        # The author name carries the 'author-link-line' class.
        author = item.find('.author-link-line').text()
        # Re-parse the answer's inner HTML before extracting text: a hidden
        # <textarea> sits on top of the content and would otherwise leak
        # duplicate text into the result.
        answer = pq(item.find('.content').html()).text()
        # Write one record: question, author, answer on separate lines.
        file.write('\n'.join([question, author, answer]))
        # Separate each record with a divider line of '=' characters.
        file.write('\n' + '=' * 50 + '\n')
# Source article: "Python: how to crawl Zhihu hot-topic data"