I recently used Scrapy to simulate logging in to Zhihu and found that the whole interface has changed, and that the verification code (CAPTCHA) flow has changed considerably as well. After analysing captured packets, I am recording here what I learned about simulating the login after the revision. Without further ado, here is the code, personally tested and working:
# -*- coding: utf-8 -*-
"""Scrapy spider that logs in to Zhihu through the revised OAuth API."""
import base64
import json

import scrapy
from PIL import Image
from scrapy.exceptions import CloseSpider


class ZhihuSpider(scrapy.Spider):
    """Log in to Zhihu, solving the CAPTCHA by hand when one is required.

    Request chain: sign-up page (to receive cookies) -> GET captcha endpoint
    (is a CAPTCHA needed?) -> optionally PUT/POST captcha endpoint (fetch the
    image, submit the typed answer) -> POST sign-in endpoint -> check_login.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    # Let 401/403 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [401, 403]

    client_id = 'c3cef7c66a1843f8b3a9e6a1e3160e20'  # fixed value
    signature = 'b858d0c8b1f2e86c6cb0d93d4055963bcf1121ec'  # from a captured request
    timestamp = '1519567594106'  # from the same captured request as signature

    # Placeholders: replace with real credentials. Scrapy copies ``-a``
    # command-line arguments onto the spider, so these can also be supplied
    # as ``-a username=... -a password=...``.
    username = '+86 your mobile number'
    password = 'password'

    signup_url = 'https://www.zhihu.com/signup?next=%2F'
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
    sign_in_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
    # Single name for the captcha image so the write and the re-open agree.
    captcha_file = 'zhihucaptcha.jpg'

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/signup?next=%2F",
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/63.0.3239.132 Safari/537.36"
        ),
        "Authorization": "oauth " + client_id,
    }

    def parse(self, response):
        """Unused default callback; every request names its own callback."""
        pass

    def start_requests(self):
        """Fetch the sign-up page so the session cookies get set.

        :return: list with the single initial request
        """
        return [scrapy.Request(
            url=self.signup_url,
            headers=self.headers,
            method="GET",
            meta={'cookiejar': 1},
            callback=self.post_captchareq,
            dont_filter=True,
        )]

    def post_captchareq(self, response):
        """Ask the captcha endpoint whether a CAPTCHA is required.

        :param response: response for the sign-up page
        :return: list with the captcha status request
        """
        return [scrapy.Request(
            url=self.captcha_url,
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            dont_filter=True,
            callback=self.deal_captchareq,
        )]

    def deal_captchareq(self, response):
        """Build the login form and branch on whether a CAPTCHA is required.

        :param response: JSON response of the captcha status request
        :return: list with either the captcha image request or the sign-in
            request
        """
        json_res = json.loads(response.text)
        post_data = {
            "client_id": self.client_id,
            "grant_type": "password",
            "timestamp": self.timestamp,
            "source": "com.zhihu.web",
            "signature": self.signature,
            "username": self.username,
            "password": self.password,
            "captcha": "",
            "lang": "en",
            "ref_source": "homepage",
            "utm_source": "",
        }
        if json_res.get("show_captcha", None):
            # PUT on the captcha endpoint returns the image; carry the form
            # along in meta so the answer can be filled in later.
            return [scrapy.Request(
                url=self.captcha_url,
                headers=self.headers,
                method='PUT',
                meta={
                    'cookiejar': response.meta['cookiejar'],
                    'post_data': post_data,
                },
                callback=self.get_captchaimg,
            )]
        return [self._sign_in_request(response, post_data)]

    def get_captchaimg(self, response):
        """Save and show the CAPTCHA image, then submit the typed answer.

        :param response: JSON response carrying the base64 image in
            ``img_base64``; ``meta['post_data']`` holds the login form
        :return: list with the request that posts the answer for validation
        :raises CloseSpider: if anything goes wrong while handling the image
        """
        post_data = response.meta['post_data']
        try:
            json_img = json.loads(response.text)
            img_stream = base64.b64decode(json_img["img_base64"])
            with open(self.captcha_file, 'wb') as f:
                f.write(img_stream)
            # The context manager closes the image even if input() fails.
            with Image.open(self.captcha_file) as img:
                img.show()
                input_captcha = input(
                    "Please enter the verification code in the figure:"
                ).strip()
            post_data['captcha'] = input_captcha
            post_code = {"input_text": input_captcha}
            return [scrapy.FormRequest(
                url=self.captcha_url,
                formdata=post_code,
                headers=self.headers,
                method='POST',
                meta={
                    'cookiejar': response.meta['cookiejar'],
                    'post_data': post_data,
                },
                callback=self.post_captcha,
                dont_filter=True,
            )]
        except Exception as e:
            # Deliberately broad: any failure here makes login impossible,
            # so stop the spider with the cause attached.
            raise CloseSpider(
                'Get captcha error: {error}'.format(error=e)
            ) from e

    def post_captcha(self, response):
        """Send the login form once the CAPTCHA answer has been accepted.

        :param response: JSON response of the captcha validation request
        :return: list with the sign-in request
        :raises CloseSpider: if the answer was rejected
        """
        post_data = response.meta.get('post_data')
        if not json.loads(response.text).get('success'):
            raise CloseSpider('Authenticode incorrect')
        return [self._sign_in_request(response, post_data)]

    def check_login(self, response):
        """Verify that the login succeeded (the API answers with 201).

        :param response: response of the sign-in request
        :raises CloseSpider: if the status is anything other than 201
        """
        self.logger.debug('==============> %s', response.text)
        self.logger.debug('%s', response.status)
        if response.status == 201:
            self.logger.info("Landing success!")
        else:
            raise CloseSpider('Login info wrong!')

    def _sign_in_request(self, response, post_data):
        """Build the sign-in POST, reusing the cookie jar of ``response``."""
        return scrapy.FormRequest(
            url=self.sign_in_url,
            formdata=post_data,
            method="POST",
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.check_login,
            dont_filter=True,
        )
In the code above, parameters such as client_id and the OAuth authorization value are fixed. The signature and timestamp change over time: they serve as a token used to verify that the user is legitimate, and in essence they are generated by JavaScript running on the client. For convenience, the signature for one fixed timestamp is obtained here directly by capturing a packet.
First enter incorrect account information on the PC (desktop) site and capture the resulting request to obtain a timestamp and signature, then substitute them for the corresponding values in the code.
Scrapy: automatic login through the revised API interface