When collecting information from the web, it is often necessary to forge request headers so that the scraping script works reliably.
Below, we use urllib2's header support to forge part of the request headers and fetch a page.
Method 1: pass a dictionary of headers to the `urllib2.Request` constructor.
?
1234567891011121314151617181920212223242526272829 |
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Filename: urllib2-header.py
"""Fetch a web page with urllib2 while sending forged request headers (method 1).

The headers are supplied as a dictionary to the ``urllib2.Request``
constructor. The response headers and the page body are printed to stdout.
"""
import urllib2
import sys

# Fetch web page content - send headers - 1
url = "http://www.jb51.net"

# Request headers imitating a desktop Firefox browser.
# NOTE(review): urllib2 is believed to override "Connection" with "close"
# when it sends the request, so the keep-alive entry may have no effect
# -- confirm against the urllib2 source before relying on it.
send_headers = {
    'Host': 'www.jb51.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
}

req = urllib2.Request(url, headers=send_headers)
r = urllib2.urlopen(req)
try:
    html = r.read()            # the page content
    receive_header = r.info()  # the returned header information
finally:
    # Always release the connection, even if reading fails.
    r.close()

# Transcode to avoid garbled output. getfilesystemencoding() can return None
# on Python 2, so fall back to UTF-8; 'replace' keeps undecodable or
# unencodable characters from aborting the script.
output_encoding = sys.getfilesystemencoding() or 'utf-8'
html = html.decode('utf-8', 'replace').encode(output_encoding, 'replace')

# Parenthesised single-argument print behaves the same as the print statement.
print(receive_header)
print(html)
|
Method 2: add the headers one at a time with `Request.add_header`.
?
1234567891011121314151617181920212223 |
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Filename: urllib2-header.py
"""Fetch a web page with urllib2 while sending forged request headers (method 2).

The headers are attached one at a time with ``Request.add_header``. The
response headers, a separator line, and the page body are printed to stdout.
"""
import urllib2
import sys

url = 'http://www.jb51.net'

req = urllib2.Request(url)
# Pretend the request was made by a Firefox browser coming from the site itself.
req.add_header('Referer', 'http://www.jb51.net/')
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
)

r = urllib2.urlopen(req)
try:
    html = r.read()            # the page content
    receive_header = r.info()  # the returned header information
finally:
    # Always release the connection, even if reading fails.
    r.close()

# Transcode to avoid garbled output. getfilesystemencoding() can return None
# on Python 2, so fall back to UTF-8; 'replace' keeps undecodable or
# unencodable characters from aborting the script.
output_encoding = sys.getfilesystemencoding() or 'utf-8'
html = html.decode('utf-8', 'replace').encode(output_encoding, 'replace')

# Parenthesised single-argument print behaves the same as the print statement.
print(receive_header)
print('#####################################')
print(html)
|
Two ways to forge HTTP headers using urllib2 in Python