We can use urllib to fetch remote data. The following are several ways to retrieve web resources with urllib in Python 3; refer to whichever fits your need.
1. The simplest approach
Import urllib.request
response = Urllib.request.urlopen (' http://python.org/')
html = Response.read ()
2. Using a Request object
Import urllib.request
req = urllib.request.Request (' http://python.org/')
response = Urllib.request.urlopen ( REQ)
the_page = Response.read ()
3. Sending data (POST)
#! /usr/bin/env python3
Import urllib.parse
import urllib.request
url = ' http://localhost/login.php '
User_agent = ' mozilla/4.0 (compatible; MSIE 5.5; Windows NT) '
values = {
' act ': ' Login ',
' login[email] ': ' yzhang@i9i8.com ',
' login[password ': ' 123456 '
}
data = Urllib.parse.urlencode (values)
req = urllib.request.Request (URL, data)
req.add_ Header (' Referer ', ' http://www.python.org/')
response = Urllib.request.urlopen (req)
the_page = Response.read ()
Print (The_page.decode ("UTF8"))
4. Sending data and headers
#! /usr/bin/env python3
Import urllib.parse
import urllib.request
url = ' http://localhost/login.php '
User_agent = ' mozilla/4.0 (compatible; MSIE 5.5; Windows NT) '
values = {
' act ': ' Login ',
' login[email] ': ' yzhang@i9i8.com ',
' login[password ': ' 123456 '
}
headers = {' User-agent ': user_agent}
data = Urllib.parse.urlencode (values)
req = Urllib.request.Request (URL, data, headers)
response = Urllib.request.urlopen (req)
the_page = Response.read ()
Print (The_page.decode ("UTF8"))
5. Handling HTTP errors
#! /usr/bin/env python3
Import urllib.request
req = urllib.request.Request (' http://www.jb51.net ')
try:
Urllib.request.urlopen (req)
except Urllib.error.HTTPError as E:
print (E.code)
print (E.read () . Decode ("UTF8"))
6. Exception handling, version 1
#! /usr/bin/env Python3 from
urllib.request import request, Urlopen from
urllib.error import Urlerror, httperror< C3/>req = Request ("http://www.jb51.net/")
try:
response = Urlopen (req)
except Httperror as E:
print (' The server couldn ' t fulfill the request. ')
Print (' Error code: ', E.code)
except Urlerror as E:
print (' We failed to reach a server. ')
Print (' Reason: ', E.reason)
else:
print ("good!")
Print (Response.read (). Decode ("UTF8"))
7. Exception handling, version 2
#! /usr/bin/env Python3
from urllib.request import request, Urlopen from
urllib.error import urlerror
req = Request ("http://www.jb51.net/")
try:
response = Urlopen (req)
except Urlerror as e:
if Hasattr (E, ' Reason '):
print (' We failed to reach a server. ')
Print (' Reason: ', E.reason)
elif hasattr (E, ' Code '):
print (' The server couldn ' t fulfill the request. '
) Print (' Error code: ', E.code)
else:
print ("good!")
Print (Response.read (). Decode ("UTF8"))
8. HTTP basic authentication
#! /usr/bin/env python3
Import urllib.request
# Create a password manager
password_mgr = Urllib.request.HTTPPasswordMgrWithDefaultRealm ()
# ADD the username and password.
# If We knew the realm, we could use it instead of None.
Top_level_url = "https://www.jb51.net/"
Password_mgr.add_password (None, Top_level_url, ' Rekfan ', ' xxxxxx ')
Handler = Urllib.request.HTTPBasicAuthHandler (password_mgr)
# Create "opener" (Openerdirector instance)
Opener = Urllib.request.build_opener (handler)
# Use the opener to fetch a URL
A_url = "https://www.jb51.net/"
x = Opener.open (a_url)
print (X.read ())
# Install the opener.
# now all calls to Urllib.request.urlopen with our opener.
Urllib.request.install_opener (opener)
a = Urllib.request.urlopen (A_url). Read (). Decode (' UTF8 ')
print (a)
9. Using a proxy
#! /usr/bin/env python3
Import urllib.request
proxy_support = Urllib.request.ProxyHandler ({' sock5 ': ' localhost : 1080 '})
opener = Urllib.request.build_opener (proxy_support)
Urllib.request.install_opener (opener)
a = Urllib.request.urlopen ("Http://www.jb51.net"). Read (). Decode ("UTF8")
print (a)
10. Setting a timeout
#! /usr/bin/env python3
Import socket
import urllib.request
# Timeout in seconds
timeout = 2
Socket.setdefaulttimeout (Timeout) # This call to
Urllib.request.urlopen now uses the default timeout
# we have s ET in the socket module
req = urllib.request.Request (' http://www.jb51.net/')
a = Urllib.request.urlopen (req) . Read ()
print (a)
Summary
That is the full content of this article. I hope it helps you learn or use Python; if you have questions, feel free to leave a comment.