標籤:https get count androi mobile ini html_ html 擷取
# coding=utf-8
"""Download a Douban mobile subject collection page by page into ``db.json``."""
import json


class DoubanSpider:
    """Page through the collection addressed by ``temp_url`` and store each item.

    Every item is appended to ``db.json`` as one JSON document per line.
    """

    # Step between pages; must stay equal to the ``count=18`` query parameter
    # that is baked into ``temp_url``.
    PAGE_SIZE = 18

    def __init__(self):
        # ``{}`` is filled with the zero-based offset of the first item wanted.
        self.temp_url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_movie_occident_hot/items?os=android&for_mobile=1&callback=jsonp3&start={}&count=18&loc_id=108288&_=0"

    @staticmethod
    def _strip_jsonp(text):
        """Return the JSON payload of *text*, removing a JSONP wrapper if any.

        ``temp_url`` asks for ``callback=jsonp3``, so the body may arrive as
        ``;jsonp3({...});`` instead of bare JSON. Bare JSON is returned
        unchanged (apart from surrounding whitespace).
        """
        if isinstance(text, bytes):
            # NOTE(review): assumes the endpoint answers in UTF-8 -- confirm.
            text = text.decode("utf-8")
        text = text.strip()
        if text.startswith(("{", "[")):
            return text
        # The wrapper's parentheses are the outermost ones, so the first "("
        # and the last ")" delimit the payload even if strings inside the
        # JSON contain parentheses themselves.
        opening = text.find("(")
        closing = text.rfind(")")
        if opening != -1 and closing > opening:
            return text[opening + 1:closing]
        return text

    def get_content_list(self, html_str):
        """Extract the items and the reported total from one response body.

        Args:
            html_str: Response body, either bare JSON or JSONP-wrapped JSON.

        Returns:
            A ``(content_list, total)`` tuple taken from the
            ``"subject_collection_items"`` and ``"total"`` keys.

        Raises:
            json.JSONDecodeError: If the payload is not valid JSON.
            KeyError: If either expected key is missing.
        """
        dict_data = json.loads(self._strip_jsonp(html_str))
        content_list = dict_data["subject_collection_items"]
        total = dict_data["total"]
        return content_list, total

    def save_content_list(self, content_list):
        """Append every item of *content_list* to ``db.json``, one per line."""
        with open("db.json", "a", encoding="utf-8") as f:
            for content in content_list:
                # ensure_ascii=False keeps CJK text readable in the file.
                f.write(json.dumps(content, ensure_ascii=False) + "\n")
        print("添加成功")

    def run(self):
        """Fetch every page of the collection and save its items."""
        # ``parse`` is a project-local helper; importing it here keeps the
        # parsing and saving methods usable when it is not installed.
        from parse import parse_url

        start = 0
        total = None  # unknown until the first response has been parsed
        while total is None or start < total:
            # 1. Build the URL of the current page.
            start_url = self.temp_url.format(start)
            # 2. Send the request and get the response body.
            html_str = parse_url(start_url)
            # 3. Extract the items and the up-to-date total.
            content_list, total = self.get_content_list(html_str)
            if not content_list:
                # An empty page means there is nothing further to fetch,
                # whatever ``total`` claims.
                break
            # 4. Persist the page.
            self.save_content_list(content_list)
            # 5. Move on to the next page and repeat steps 1-5.
            start += self.PAGE_SIZE


if __name__ == "__main__":
    douban = DoubanSpider()
    douban.run()
python 爬取豆瓣電影案例