# coding=utf-8
"""Simple Jianshu (jianshu.com) crawler.

Asks for a page count, fetches that many listing pages, and appends each
article's title, abstract, and absolute link to ./Chapter Summary.txt.
"""
import requests
from bs4 import BeautifulSoup

m = input("Enter the number of pages to capture:")
# range(1, int(m) + 1): capture exactly m pages (the original off-by-one
# stopped at m - 1).
for i in range(1, int(m) + 1):
    url = "https://www.jianshu.com/?page=" + str(i)
    # X-INFINITESCROLL / X-Requested-With make the server return the
    # AJAX listing fragment instead of the full page shell.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Accept': 'text/html, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'https://www.jianshu.com/',
        'X-INFINITESCROLL': 'true',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'close',
    }
    html = requests.get(url=url, headers=headers)
    # Re-encode with the detected charset, then decode as UTF-8 so the
    # parser sees clean Unicode text.
    soup = BeautifulSoup(html.text.encode(html.encoding).decode('utf-8'),
                         'html.parser')
    # Article cards: <a class="title"> holds title + href,
    # <p class="abstract"> holds the introduction text.
    titles = soup.find_all('a', 'title')
    titlesp = soup.find_all('p', 'abstract')
    # Append mode: results from all pages accumulate in one file
    # (name matches the final status message).
    with open(r"./Chapter Summary.txt", "a", encoding='utf-8') as file:
        for title, titlep in zip(titles, titlesp):
            # NOTE(review): .string is None for tags with nested markup —
            # guard so one odd card doesn't abort the whole run.
            if title.string is None or titlep.string is None:
                continue
            file.write(title.string + '\n')
            file.write(titlep.string + '\n')
            file.write("https://www.jianshu.com" + title.get('href') + '\n\n')

print("the execution is complete and saved in the directory: ./Chapter Summary.txt")
Environment: python3
Modules: requests, bs4
A simple crawler for Jianshu (jianshu.com): set the number of pages to capture, and it saves each article's title, introduction, and link.