總結下,Python 下載網頁的幾種方法
1
fd = urllib2.urlopen(url_link)
data = fd.read()
這是最簡潔的一種,當然也是Get的方法
2
通過GET的方法
def GetHtmlSource(url):
try:
htmSource = ''
req = urllib2.Request(url)
fd = urllib2.urlopen(req,"")
while 1:
data = fd.read(1024)
if not len(data):
break
htmSource += data
fd.close()
del fd
del req
htmSource = htmSource.decode('cp936')
htmSource = formatStr(htmSource)
return htmSource
except socket.error, err:
str_err = "%s" % err
return ""
3
通過GET的方法
def GetHtmlSource_Get(htmurl):
htmSource = ""
try:
urlx = httplib.urlsplit(htmurl)
conn = httplib.HTTPConnection(urlx.netloc)
conn.connect()
conn.putrequest("GET", htmurl, None)
conn.putheader("Content-Length", 0)
conn.putheader("Connection", "close")
conn.endheaders()
res = conn.getresponse()
htmSource = res.read()
except Exception(), err:
trackback.print_exec()
conn.close()
return htmSource
通過POST的方法
def GetHtmlSource_Post(getString):
htmSource = ""
try:
url = httplib.urlsplit("http://app.sipo.gov.cn:8080")
conn = httplib.HTTPConnection(url.netloc)
conn.connect()
conn.putrequest("POST", "/sipo/zljs/hyjs-jieguo.jsp")
conn.putheader("Content-Length", len(getString))
conn.putheader("Content-Type", "application/x-www-form-urlencoded")
conn.putheader("Connection", " Keep-Alive")
conn.endheaders()
conn.send(getString)
f = conn.getresponse()
if not f:
raise socket.error, "timed out"
htmSource = f.read()
f.close()
conn.close()
return htmSource
except Exception(), err:
trackback.print_exec()
conn.close()
return htmSource