Introduction to the requests module of the Python crawler
Introduction
# Introduction: requests can be used to simulate browser requests. Compared with urllib, the requests API is far more convenient (in essence, it wraps urllib3).
# Note: after the requests library downloads the page content, it does not execute any JS code; we have to analyze the target site ourselves and issue further requests as needed.
# Install: pip3 install requests
# Request methods: requests.get() and requests.post() are the most commonly used.

>>> import requests
>>> r = requests.get('https://api.github.com/events')
>>> r = requests.post('http://httpbin.org/post', data={'key': 'value'})
>>> r = requests.put('http://httpbin.org/put', data={'key': 'value'})
>>> r = requests.delete('http://httpbin.org/delete')
>>> r = requests.head('http://httpbin.org/get')
>>> r = requests.options('http://httpbin.org/get')
GET request
requests.get(
    url='',
    headers={
        'User-Agent': '',  # browser identification
        'Referer': '',     # URL of the previous request
    },
    cookies={},
)
# headers holds the request header information; User-Agent identifies the browser, Referer is the URL the request came from
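Putting the skeleton above to work, here is a minimal sketch of a GET request that carries a query string via the params argument (the URL, query key, and header value are this editor's placeholders, not from the original):

import requests

# a minimal sketch: requests url-encodes the params dict into the query string;
# the User-Agent value is a placeholder you would copy from your own browser
response = requests.get(
    'https://www.baidu.com/s',
    params={'wd': 'python'},
    headers={'User-Agent': 'Mozilla/5.0'},
)
print(response.url)          # the final url, ending in ?wd=python
print(response.status_code)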
POST request
# GET request
#   * GET is HTTP's default request method
#   * It has no request body
#   * Its data must stay within 1 KB
#   * Its data is exposed in the browser's address bar
# Common GET request operations:
#   1. Typing a URL into the browser's address bar always issues a GET request
#   2. Clicking a hyperlink on a page also issues a GET request
#   3. A submitted form uses GET by default, but can be set to POST
# POST request
#   (1) The data does not appear in the address bar
#   (2) There is no limit on the data size
#   (3) It has a request body
#   (4) If the request body contains Chinese characters, they are URL-encoded
# !!! requests.post() is used exactly like requests.get(); the difference is that requests.post() takes a data parameter to carry the request body. The contrast is easy to verify, as shown below.
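As a hedged illustration of the contrast above, the same key/value pair can be echoed off httpbin.org (a public echo service chosen here for illustration, not named in the original): it travels in the URL for GET and in the body for POST.

import requests

# GET: the data ends up in the query string, visible in the url
r_get = requests.get('http://httpbin.org/get', params={'key': 'value'})
print(r_get.url)             # http://httpbin.org/get?key=value
print(r_get.json()['args'])  # {'key': 'value'}

# POST: the same data travels in the request body instead
r_post = requests.post('http://httpbin.org/post', data={'key': 'value'})
print(r_post.url)             # http://httpbin.org/post (no query string)
print(r_post.json()['form'])  # {'key': 'value'}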
Send a POST request to simulate browser login behavior
'''
Step 1: analyze the target site
    Open https://github.com/login in a browser, enter a wrong username and password, and capture the packets.
    The login is POSTed to https://github.com/session.
    The request header carries a cookie, and the request body contains:
        commit: Sign in
        utf8: ✓
        authenticity_token: <token scraped from the login page>
        login: egonlin
        password: 123

Step 2: the request flow
    First, GET https://github.com/login to obtain the initial cookie and the authenticity_token.
    Then, POST to https://github.com/session with the initial cookie and the request body
    (authenticity_token, username, password, ...) to obtain the login cookie.

    ps: if the password is submitted as ciphertext, first enter a wrong account with the correct
    password and grab the encrypted password from the browser; github submits the password in plaintext.
'''
import requests
import re

# First request
r1 = requests.get('https://github.com/login')
r1_cookie = r1.cookies.get_dict()  # the initial cookie (unauthorized)
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text)[0]  # the csrf token from the page

# Second request: POST to the login URL with the initial cookie, the token, and the username/password
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'xxxxxx',
    'password': 'xxxxx',
}
r2 = requests.post('https://github.com/session', data=data, cookies=r1_cookie)
login_cookie = r2.cookies.get_dict()

# Third request: subsequent requests log in by carrying login_cookie, e.g. to access personal settings
r3 = requests.get('https://github.com/settings/emails', cookies=login_cookie)
print('xxxx' in r3.text)  # True
Automatically log in to github with requests.session() (no manual cookie handling)
import requests
import re

session = requests.session()

# First request
r1 = session.get('https://github.com/login')
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text)[0]  # the csrf token from the page

# Second request: the session carries the initial cookie for us
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'xxx',
    'password': 'xxx',
}
r2 = session.post('https://github.com/session', data=data)

# Third request: the session also carries the login cookie
r3 = session.get('https://github.com/settings/emails')
print('xxxx' in r3.text)  # True
Supplement: request headers and the data/json parameters
requests.post(url='xxxxxxxx', data={'xxx': 'yyy'})  # no request header specified;
# default request header: application/x-www-form-urlencoded

# If the header is set to application/json but the value is passed via data,
# the server cannot obtain the value
requests.post(url='', data={'': 1}, headers={'content-type': 'application/json'})

requests.post(url='', json={'': 1})  # default request header: application/json
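A quick way to confirm the difference is to echo both styles off httpbin.org/post (an illustrative sketch, assuming that service is reachable) and see which field of the echoed request the value lands in:

import requests

# data= sends application/x-www-form-urlencoded; the value arrives as form data
r1 = requests.post('http://httpbin.org/post', data={'k': 1})
print(r1.json()['form'])  # {'k': '1'}

# json= sends application/json; the value arrives as a json body
r2 = requests.post('http://httpbin.org/post', json={'k': 1})
print(r2.json()['json'])                   # {'k': 1}
print(r2.request.headers['Content-Type'])  # application/json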
Response
import requests

response = requests.get('http://www.jianshu.com')

# response attributes
print(response.text)
print(response.content)
print(response.status_code)
print(response.headers)
print(response.cookies)
print(response.cookies.get_dict())
print(response.cookies.items())
print(response.url)
print(response.history)
print(response.encoding)

# Close the connection when done: response.close()
from contextlib import closing
with closing(requests.get('xxx', stream=True)) as response:
    for line in response.iter_content():
        pass
Encoding Problems
# Encoding problem
import requests

response = requests.get('http://www.autohome.com/news')
response.encoding = 'gbk'  # the page returned by Autohome is gb2312-encoded, while requests
                           # defaults to ISO-8859-1; without setting gbk, the Chinese text is garbled
print(response.text)
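When the page's encoding is not known in advance, requests can also guess it from the body via the apparent_encoding attribute; a minimal sketch using the same URL:

import requests

response = requests.get('http://www.autohome.com/news')
print(response.encoding)                        # what the headers claim, often ISO-8859-1
response.encoding = response.apparent_encoding  # let requests guess from the body (e.g. GB2312)
print(response.text)                            # the Chinese text now renders correctly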
Obtain binary data
import requests

response = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg')

with open('a.jpg', 'wb') as f:
    f.write(response.content)
# stream parameter: fetch the data piece by piece. For example, when downloading a 100 GB video,
# writing response.content to the file in one go is unreasonable.
import requests

response = requests.get(
    'https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/HSV',
    stream=True,
)
with open('b.mp4', 'wb') as f:
    for line in response.iter_content():
        f.write(line)
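Reading one byte per iteration is slow; iter_content also accepts a chunk_size. A sketch of the same idea with 1 KB chunks (the URL is a placeholder):

import requests

response = requests.get('https://gss3.baidu.com/xxx.mp4', stream=True)
with open('b.mp4', 'wb') as f:
    # read roughly 1 KB per iteration instead of one byte at a time
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:  # skip keep-alive chunks
            f.write(chunk)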
Parse json
# Parse json
import requests
import json

response = requests.get('http://httpbin.org/get')

res1 = json.loads(response.text)  # too troublesome
res2 = response.json()            # get the json data directly

print(res1 == res2)  # True
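If the body is not valid json, response.json() raises an error; a hedged sketch of guarding against that (the endpoint and the try/except shape are this editor's additions):

import requests

response = requests.get('http://httpbin.org/html')  # returns html, not json
try:
    data = response.json()
except ValueError:  # raised when the body cannot be parsed as json
    data = None
print(data)  # None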
Redirection and History
import requests
import re

# First request
r1 = requests.get('https://github.com/login')
r1_cookie = r1.cookies.get_dict()  # the initial cookie (unauthorized)
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text)[0]  # the csrf token from the page

# Second request: POST to the login URL with the initial cookie, the token, and the username/password
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': '1970@qq.com',
    'password': 'alex3714',
}

# Test 1: without allow_redirects=False, a Location in the response header makes requests
# jump to the new page; r2 is the response of the new page
r2 = requests.post('https://github.com/session', data=data, cookies=r1_cookie)
print(r2.status_code)      # 200
print(r2.url)              # the page after the jump
print(r2.history)          # the responses before the jump
print(r2.history[0].text)  # the body of the response before the jump

# Test 2: with allow_redirects=False, requests does not jump even if Location appears
# in the response header; r2 is the response of the old page
r2 = requests.post('https://github.com/session', data=data, cookies=r1_cookie, allow_redirects=False)
print(r2.status_code)  # 302
print(r2.url)          # the page before the jump: https://github.com/session
print(r2.history)      # []
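The same mechanics can be observed without a GitHub account through httpbin's redirect endpoint (an illustrative substitute, not from the original):

import requests

# httpbin.org/redirect/1 answers 302 with a Location header pointing at /get
r = requests.get('http://httpbin.org/redirect/1')
print(r.status_code)  # 200, the page after the jump
print(r.url)          # http://httpbin.org/get
print(r.history)      # [<Response [302]>], the response before the jump

r = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
print(r.status_code)          # 302
print(r.headers['Location'])  # the target we chose not to follow
print(r.history)              # []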
Advanced usage
1. SSL Cert Verification
# Certificate verification (most websites use https)
import requests

response = requests.get('https://www.12306.cn')  # an ssl request verifies the certificate first;
                                                 # if it is invalid, an error is raised and the program aborts

# Improvement 1: suppress the error, though a warning is still reported
import requests

response = requests.get('https://www.12306.cn', verify=False)  # skip certificate verification; a warning is reported and 200 is returned
print(response.status_code)

# Improvement 2: suppress the error and silence the warning
import requests
from requests.packages import urllib3
urllib3.disable_warnings()  # disable the warning

response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)

# Improvement 3: carry a certificate
# Many https websites can be visited without a certificate; in most cases carrying one is optional
# (Zhihu, Baidu, and the like work either way). If a site has a hard requirement, the certificate
# must be carried, e.g. for targeted users: only after obtaining the certificate do you have
# permission to access that site.
import requests

response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
print(response.status_code)
2. Use a proxy
# Proxy settings: the request is sent to the proxy first, and the proxy forwards it for us
# (getting an IP banned is a common issue; see the Proxies section of the official requests docs)
import requests

proxies = {
    'http': 'http://egon:123@localhost:100',  # proxy with username and password; they come before the @ sign
    # 'http': 'http://localhost:8080',        # a plain http proxy (a dict can hold only one 'http' key)
    'https': 'https://localhost:9743',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

# socks proxies are also supported; install: pip install requests[socks]
import requests

proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
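Proxies can also be attached to a session once instead of being repeated per request; a minimal sketch (the addresses are placeholders):

import requests

session = requests.session()
session.proxies.update({  # every request made through this session now uses the proxies
    'http': 'http://localhost:8080',
    'https': 'https://localhost:9743',
})
response = session.get('https://www.12306.cn')
print(response.status_code)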
3. Timeout settings
# Timeout settings
# two kinds of timeout: a float or a tuple
# timeout=0.1         # the timeout for receiving data (a single float also covers connecting)
# timeout=(0.1, 0.2)  # 0.1 is the connect timeout, 0.2 the timeout for receiving data

import requests

response = requests.get('https://www.baidu.com', timeout=0.0001)
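To see the two halves of the tuple separately, httpbin's delay endpoint helps (an illustrative sketch; /delay/3 holds the response for three seconds):

import requests
from requests.exceptions import ReadTimeout

# the connection is established quickly, but the server sits on the response for
# 3 seconds, so the 1-second read timeout fires rather than the connect timeout
try:
    requests.get('http://httpbin.org/delay/3', timeout=(0.5, 1))
except ReadTimeout:
    print('read timed out; the connect phase was fine')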
4. Authentication Settings
# Authentication: some sites pop up a box (much like an alert) asking for a username and
# password before any html can be fetched.
# In essence, the credentials are spliced into a request header:
#   r.headers['Authorization'] = _basic_auth_str(self.username, self.password)
# Websites rarely use this default scheme; most roll their own. We then have to follow the
# site's scheme, write a function similar to _basic_auth_str, obtain the encrypted string,
# and add it to the request header:
#   r.headers['Authorization'] = func('.....')

# The default scheme looks like this:
import requests
from requests.auth import HTTPBasicAuth

r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
print(r.status_code)

# HTTPBasicAuth can be abbreviated to the following form:
import requests

r = requests.get('xxx', auth=('user', 'password'))
print(r.status_code)
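requests also ships digest authentication out of the box; a minimal sketch, assuming the target actually uses digest auth (httpbin exposes a test endpoint whose URL embeds the expected user/password pair):

import requests
from requests.auth import HTTPDigestAuth

r = requests.get('http://httpbin.org/digest-auth/auth/user/pass',
                 auth=HTTPDigestAuth('user', 'pass'))
print(r.status_code)  # 200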
5. Exception Handling
# Exception handling
import requests
from requests.exceptions import *  # see requests.exceptions for the available exception types

try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('=====')
# except ConnectionError:  # network failure
#     print('-----')
# except Timeout:          # covers both timeout kinds
#     print('aaaaa')
except RequestException:
    print('Error')
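On top of catching the exception, a common pattern is a small retry loop; a sketch under the assumption that a few retries suit the target site (get_with_retry is a hypothetical helper, not part of requests):

import requests
from requests.exceptions import RequestException

def get_with_retry(url, retries=3, timeout=3):
    # try the request up to `retries` times before giving up
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=timeout)
        except RequestException:
            if attempt == retries - 1:
                raise  # out of attempts; let the caller see the error

r = get_with_retry('http://www.baidu.com')
print(r.status_code)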
6. Upload files
import requests

files = {'file': open('a.jpg', 'rb')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)
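files= also accepts a tuple, which controls the filename and content type the server sees; a minimal sketch (the filenames are placeholders):

import requests

# (filename, file object, content-type): requests builds the multipart part from the tuple
files = {'file': ('renamed.jpg', open('a.jpg', 'rb'), 'image/jpeg')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.json()['files'].keys())  # dict_keys(['file'])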