QQ Zone Python Crawler (Part 3) — Final Chapter
After testing, the code from the previous section runs successfully. The next step is to add loop-based paging so the crawler fetches all of the posts ("shuoshuo").
Complete code:
"""QQ Zone ("Qzone") crawler: page through a user's "shuoshuo" posts,
save the text posts and image URLs to files, then download the images.

NOTE(review): this body is reconstructed from an OCR/extraction-garbled
listing; JSON key names ('vFeeds', 'picdata', 'photourl') follow the
original text but should be verified against a live API response.
"""
import requests
import json
import os
import shutil
import time

qq = 627911861  # target QQ number (also used as the output file name)

# Headers copied from a logged-in mobile browser session; 'cookie' must be
# replaced with a real session cookie for the request to be authorized.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'xxxx',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/60.0.3112.113 Mobile Safari/537.36',
}

# The list endpoint is split around the paging offset: url_x + offset + url_y.
url_x = ('https://mobile.qzone.qq.com/list?qzonetoken=xxxx'
         '&g_tk=1573033187&res_attach=att%3D')
url_y = ('%26tl%3D1508257557&format=json&list_type=shuoshuo'
         '&action=0&res_uin=627911861&count=40')

numbers = 0        # "view more" paging offset, advanced by 40 per request
img_set = set()    # de-duplicated image URLs
word_count = 0     # sequence number prefixed to each text post
words = ""         # accumulated text posts
images = ""        # accumulated image-URL lines
page = int(1761 / 40)  # total posts / posts-per-page -> number of pages

for i in range(page):
    try:
        html = requests.get(url_x + str(numbers) + url_y,
                            headers=headers).content
        data = json.loads(html)

        for vFeed in data['data']['vFeeds']:
            # A feed entry may carry pictures, text, or both.
            if 'pic' in vFeed:
                for pic in vFeed['pic']['picdata']['pic']:
                    img_set.add(pic['photourl']['0']['url'])

            if 'summary' in vFeed:
                words += str(word_count) + '.' + vFeed['summary']['summary'] + '\r\n'
                word_count += 1
    except Exception as exc:
        # Best-effort: a bad page (expired cookie, throttling, schema change)
        # is reported but must not abort the whole crawl.
        print('error:', exc)

    numbers += 40      # was garbled as "numbers ++ = 40" in the listing
    time.sleep(10)     # throttle so the account is not rate-limited/banned

try:
    # Text posts -> "<qq>.txt" in the working directory.
    with open(os.path.join(os.getcwd(), str(qq) + '.txt'), 'wb') as fo:
        fo.write(words.encode('utf-8'))
    print("text posts written")

    # Image URLs -> "images_url", one URL per line.
    with open(os.path.join(os.getcwd(), 'images_url'), 'wb') as foImg:
        for imgUrl in img_set:
            images += imgUrl + '\r\n'
        foImg.write(images.encode('utf-8'))
    print("image urls written")
except Exception as exc:
    print('data write error:', exc)

if not img_set:
    print(u'no picture posts')
else:
    image_path = os.path.join(os.getcwd(), 'images')
    if not os.path.exists(image_path):
        os.mkdir(image_path)

    x = 1
    for imgUrl in img_set:
        temp = os.path.join(image_path, '%s.jpg' % x)
        print(u'picture %s: %s' % (x, imgUrl))
        try:
            # stream=True keeps the body unread so it can be piped to disk.
            r = requests.get(imgUrl, stream=True)
            if r.status_code == 200:
                with open(temp, 'wb') as f:
                    r.raw.decode_content = True  # undo gzip/deflate transparently
                    shutil.copyfileobj(r.raw, f)
        except Exception:
            print(u'image download failed: %s' % imgUrl)
        x += 1