標籤:base try neu repos asc exce rom yield .json
爬取知乎Python中文社區資訊,https://zhuanlan.zhihu.com/zimei
# Crawl the article list of the Zhihu column "zimei" through the column's
# JSON API, append each article summary to a local JSON-lines file and insert
# it into MongoDB.
import json
import time
from urllib.parse import urlencode

import requests
from pymongo import MongoClient
from pyquery import PyQuery as pq

# The API returns 10 articles per request (limit=10); get_page() appends the
# offset query parameter to this prefix.
base_url = 'https://www.zhihu.com/api/v4/columns/zimei/articles?limit=10&'
headers = {
    'authority': 'www.zhihu.com',
    'referer': 'https://zhuanlan.zhihu.com/zimei',
    'origin': 'https://zhuanlan.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

# MongoClient() with no arguments uses the driver's default host and port.
client = MongoClient()
db = client['zhihu']
collection = db['zhihu']
max_page = 100

# Seconds to wait for the server before giving up on a request, so that one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def get_page(page):
    """Fetch one page of the column's article list.

    Args:
        page: Page index. The request offset is ``page * 10``, matching the
            ``limit=10`` in ``base_url``.

    Returns:
        The decoded JSON body when the server answers 200, otherwise ``None``
        (non-200 status, network failure, timeout, or a body that is not
        valid JSON).
    """
    params = {'offset': page * 10}
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # RequestException also covers timeouts, which ConnectionError alone
        # does not.
        print('Error', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # The body was not JSON (for example an HTML error page).
        print('Error', e.args)
        return None


def parse_page(json_1, page=None):
    """Yield one summary dict per article found in an API response.

    Args:
        json_1: Decoded response as returned by ``get_page``; ``None`` or an
            empty value yields nothing.
        page: Page index the response belongs to. When it equals 1 nothing is
            yielded. The default ``None`` skips nothing.

    Yields:
        Dicts with the keys ``name``, ``title``, ``text``, ``comments``,
        ``reposts`` and ``data`` (the formatted ``updated`` timestamp, or
        ``None`` when the article carries no ``updated`` field).
    """
    # NOTE(review): the published script dropped every item of page 1 by
    # reading a global ``page``; that behaviour is kept here but made an
    # explicit argument. Together with the crawl starting at page 1 this means
    # offsets 0-19 are never stored -- confirm whether that is intended.
    if not json_1 or page == 1:
        return
    for item in json_1.get('data') or []:
        author = item.get('author') or {}
        excerpt = item.get('excerpt')
        updated = item.get('updated')
        if updated is None:
            # time.localtime(None) would silently return the current time.
            updated_text = None
        else:
            updated_text = time.strftime('%Y-%m-%d %H:%M', time.localtime(updated))
        yield {
            'name': author.get('name'),
            'title': item.get('title'),
            # The excerpt is passed through PyQuery to keep only its text;
            # a missing or empty excerpt becomes an empty string.
            'text': pq(excerpt).text() if excerpt else '',
            'comments': item.get('comment_count'),
            'reposts': item.get('voteup_count'),
            'data': updated_text,
        }


def write_to_file(content):
    """Append ``content`` as one JSON line to ``zhihu.json`` (UTF-8)."""
    with open('zhihu.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def save_to_mongo(result):
    """Insert ``result`` into the ``zhihu`` collection and report success.

    ``insert_one`` adds an ``_id`` key to ``result`` in place, so call this
    after anything that serializes the dict with ``json.dumps``.
    """
    if collection.insert_one(result).acknowledged:
        print('Saved to Mongo')


def main():
    """Crawl pages 1..max_page, printing and storing every parsed article."""
    for page in range(1, max_page + 1):
        json_1 = get_page(page)
        for result in parse_page(json_1, page):
            print(result)
            write_to_file(result)
            save_to_mongo(result)


if __name__ == '__main__':
    main()
爬取知乎Python中文社區資訊