N ways to fetch web resources with Python 3
1. The simplest way
# Fetch a page with a single urlopen() call and read the raw response bytes.
import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('http://python.org/') as response:
    html = response.read()
2. Using a Request object
# Build an explicit Request object first, then hand it to urlopen().
import urllib.request

req = urllib.request.Request('http://python.org/')
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as response:
    the_page = response.read()
3. Send data
#!/usr/bin/env python3
# POST form data to a login page and print the decoded response body.
import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
values = {
    'act': 'login',
    # Placeholder address: the value in the source text was redacted
    # ("[email protected]").
    'login[email]': 'user@example.com',
    'login[password]': '123456',
}
# urlencode() returns str, but Request only accepts bytes as POST data,
# so the encoded form must be converted before it is attached.
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
with urllib.request.urlopen(req) as response:
    the_page = response.read()
print(the_page.decode("utf8"))
4. Send data and headers
#!/usr/bin/env python3
# POST form data with a custom User-Agent header and print the response body.
import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    # Placeholder address: the value in the source text was redacted
    # ("[email protected]").
    'login[email]': 'user@example.com',
    'login[password]': '123456',
}
headers = {'User-Agent': user_agent}
# urlencode() returns str, but Request only accepts bytes as POST data,
# so the encoded form must be converted before it is attached.
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
print(the_page.decode("utf8"))
5. HTTP Error
#!/usr/bin/env python3
# Show how an HTTP error status surfaces as urllib.error.HTTPError.
# Import urllib.error explicitly rather than relying on urllib.request
# having loaded it as a side effect.
import urllib.error
import urllib.request

req = urllib.request.Request('http://www.111cn.net')
try:
    # Only the status matters here; close the response straight away.
    urllib.request.urlopen(req).close()
except urllib.error.HTTPError as e:
    # An HTTPError doubles as a response object: it carries the status
    # code and the error page body can be read from it.
    print(e.code)
    print(e.read().decode("utf8"))
6. Exception Handling 1
#!/usr/bin/env python3
# Handle HTTP-level and connection-level failures with separate handlers.
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request("http://www.111cn.net/")
try:
    response = urlopen(req)
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first;
    # otherwise the URLError handler below would swallow it.
    print("The server couldn't fulfill the request.")
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    # Reached only when urlopen() succeeded.
    print("good!")
    with response:
        print(response.read().decode("utf8"))
7. Exception Handling 2
#!/usr/bin/env python3
# Handle both failure kinds in a single URLError handler by inspecting
# the attributes of the exception.
from urllib.request import Request, urlopen
from urllib.error import URLError

req = Request("http://www.111cn.net/")
try:
    response = urlopen(req)
except URLError as e:
    # Test for 'code' first: in Python 3 an HTTPError exposes 'reason' as
    # well as 'code', so checking 'reason' first would make the HTTP-error
    # branch unreachable. A plain URLError has only 'reason'.
    if hasattr(e, 'code'):
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
    elif hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
else:
    # Reached only when urlopen() succeeded.
    print("good!")
    with response:
        print(response.read().decode("utf8"))
8. HTTP Authentication
#!/usr/bin/env python3
# Fetch a URL protected by HTTP Basic authentication.
import urllib.request

# Create a password manager.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()

# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = "https://www.111cn.net/"
password_mgr.add_password(None, top_level_url, 'rekfan', 'xxxxxx')

handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

# Create an "opener" (OpenerDirector instance).
opener = urllib.request.build_opener(handler)

# Use the opener to fetch a URL.
a_url = "https://www.111cn.net/"
with opener.open(a_url) as x:
    print(x.read())

# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)

with urllib.request.urlopen(a_url) as response:
    a = response.read().decode('utf8')
print(a)
9. Using a proxy
#!/usr/bin/env python3
# Install a global opener configured with a proxy handler, then fetch a page.
import urllib.request

# NOTE(review): ProxyHandler keys are URL schemes. A 'sock5' entry is only
# consulted for sock5:// URLs, so it does not affect the http:// request
# below, and urllib has no built-in SOCKS support. To route HTTP traffic
# through an HTTP proxy, use a mapping such as
# {'http': 'http://localhost:1080'} - confirm what listens on port 1080.
proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

with urllib.request.urlopen("http://www.111cn.net") as response:
    a = response.read().decode("utf8")
print(a)
10. Timeout
#!/usr/bin/env python3
# Apply a process-wide socket timeout to urlopen() calls.
import socket
import urllib.request

# Timeout in seconds.
timeout = 2
socket.setdefaulttimeout(timeout)

# urllib.request.urlopen now uses the default timeout
# we have set in the socket module.
req = urllib.request.Request('http://www.111cn.net/')
with urllib.request.urlopen(req) as response:
    a = response.read()
print(a)
Python 3 urllib detailed usage (headers, proxy, timeout, authentication, exception handling)