Python crawler: log in to Zhihu with a Python crawler
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Log in to zhihu.com with ``requests`` and persist the session cookies.

Required
- requests (required)
- pillow (optional, only used to display the captcha image)

The module keeps one shared ``session`` whose cookies are loaded from, and
saved to, the file ``cookies`` in the current working directory.
"""
import os.path
import re
import time

try:
    import cookielib  # Python 2 name
except ImportError:
    import http.cookiejar as cookielib  # Python 3 name

import requests

try:
    from PIL import Image
except ImportError:
    # pillow is optional: without it get_captcha() asks the user to open the
    # saved image by hand.
    Image = None

try:
    input = raw_input  # Python 2: make input() return the raw string
except NameError:
    pass

# NOTE: the captcha could be recognised automatically with pytesseract, which
# needs the tesseract-ocr software installed, e.g.
# https://jaist.dl.sourceforge.net/project/tesseract-ocr-alt/tesseract-ocr-setup-3.02.02.exe
# The recognition rate measured was too low, so it is not used for now and
# the captcha is always typed in manually.

# Construct the request headers.
agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
headers = {'User-Agent': agent}

# Reuse the cookies of a previous login when they are available.
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except (IOError, OSError):
    # Missing or unreadable cookie file: continue with an empty jar.
    print("Cookie failed to load")


def get_xsrf():
    """Return the ``_xsrf`` token embedded in the zhihu.com home page.

    ``_xsrf`` is a dynamically changing parameter, so it is fetched again for
    every login attempt.

    Raises:
        IndexError: if the page contains no ``_xsrf`` field.
    """
    index_url = 'https://www.zhihu.com'
    index_page = session.get(index_url, headers=headers)
    found = re.search(r'name="_xsrf" value="(.*?)"', index_page.text)
    if found is None:
        # Same exception type as indexing an empty findall() result, but with
        # a message that says what went wrong.
        raise IndexError('_xsrf token not found in %s' % index_url)
    return found.group(1)


def get_captcha():
    """Download the login captcha, show it, and return what the user types.

    The image is written to ``captcha.jpg`` in the current directory. When
    pillow is installed the image is opened in a viewer; otherwise the path
    of the saved file is printed so it can be opened manually.
    """
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)

    displayed = False
    if Image is not None:
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
            displayed = True
        except (IOError, OSError):
            # Unreadable image: fall through to the manual hint below.
            pass
    if not displayed:
        print(u' go to the %s directory to find captcha.jpg and manually enter'
              % os.path.abspath('captcha.jpg'))
    captcha = input("please input the captcha \n>")
    return captcha


def isLogin():
    """Return True when the saved session is already logged in.

    The profile settings page is requested without following redirects; only
    a direct 200 response counts as logged in.
    """
    url = "https://www.zhihu.com/settings/profile"
    login_code = session.get(
        url, headers=headers, allow_redirects=False).status_code
    return login_code == 200


def _parse_login_reply(login_page):
    """Decode the login reply as JSON, or return {} when it is not JSON."""
    try:
        reply = login_page.json()
    except ValueError:
        return {}
    return reply if isinstance(reply, dict) else {}


def login(secret, account):
    """Log in with a phone number or an e-mail address.

    Args:
        secret: the account password.
        account: an 11-digit mobile number starting with 1, or anything else,
            which is treated as an e-mail address.

    Returns:
        The ``r`` field of the server reply (callers treat 0 as success), or
        None when the reply could not be decoded.
    """
    # Use the entered user name to decide which login endpoint applies.
    if re.match(r"^1\d{10}$", account):
        print("mobile phone number logon \n")
        post_url = 'https://www.zhihu.com/login/phone_num'
        account_field = 'phone_num'
    else:
        print("email login \n")
        post_url = 'https://www.zhihu.com/login/email'
        account_field = 'email'
    postdata = {
        '_xsrf': get_xsrf(),
        'password': secret,
        'remember_me': 'true',
        account_field: account,
    }

    # First try to log in directly, without a verification code.
    login_page = session.post(post_url, data=postdata, headers=headers)
    print(login_page.status_code)
    login_code = _parse_login_reply(login_page)

    if login_code.get('r') != 0:
        # Presumably a captcha is required; retry once with one filled in.
        postdata["captcha"] = get_captcha()
        login_page = session.post(post_url, data=postdata, headers=headers)
        # The reply is parsed as JSON; it must never be passed to eval().
        login_code = _parse_login_reply(login_page)

    print(login_code.get('msg', login_page.text))
    # Persist the cookies before returning so the next run can reuse them.
    session.cookies.save()
    return login_code.get('r')


def main():
    """Prompt for credentials until logged in, then print the profile page."""
    while True:
        if isLogin():
            print('you have logged on to ')
            break
        account = input('enter your USERNAME \n>')
        secret = input("enter your password \n> ")
        result = login(secret, account)
        if result == 0:
            # Crawl the website content after a successful login.
            conf_url = "https://www.zhihu.com/settings/profile"
            text = session.get(conf_url, headers=headers).text
            print(text)
            break


if __name__ == '__main__':
    main()
To run the code above under Python 2.*, you only need to modify its print statements.
The code above is based on code shared by other users online. If you find any errors or know a better approach, please leave a comment!