擷取html的方法【一】:使用urllib
# -*- coding: UTF-8 -*-
import urllib
def getWebPageContent(url):
    """Fetch the resource at ``url`` and return its raw body.

    Args:
        url: Address to open; anything ``urlopen`` accepts.

    Returns:
        The complete response body as returned by ``read()``.

    Errors raised by ``urlopen`` or ``read`` propagate to the caller.
    """
    # urlopen lives in urllib on Python 2 and in urllib.request on Python 3;
    # resolving it here keeps the function usable on both.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()
# Demo: fetch the CSDN blog front page and write it to stdout.
url = 'http://blog.csdn.net'
content = getWebPageContent(url)
# The parenthesised form prints the same on Python 2 and is also valid
# syntax on Python 3 (the bare print statement is not).
print(content)
擷取html的方法【二】:使用Pycurl
# Pycurl參考地址:http://pycurl.sourceforge.net/
# Pycurl:http://pycurl.sourceforge.net/download/pycurl-7.18.1.tar.gz
# -*-coding: UTF-8 -*-
import pycurl
from io import BytesIO


def getURLContent_pycurl(url):
    """Download ``url`` with PycURL and return the response body.

    Redirects are followed (FOLLOWLOCATION=1) up to a limit of five
    (MAXREDIRS=5).

    Args:
        url: Address handed to libcurl via ``pycurl.URL``.

    Returns:
        Everything libcurl wrote to the in-memory buffer, as a byte string.

    Errors raised by ``perform()`` propagate to the caller.
    """
    curl = pycurl.Curl()
    # libcurl delivers the body as bytes, so collect it in a bytes buffer.
    body = BytesIO()
    try:
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, body.write)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        # Proxy (uncomment and adjust when a proxy is required)
        #curl.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        #curl.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
        curl.perform()
    finally:
        # Free the libcurl handle whether or not the transfer succeeded.
        curl.close()
    return body.getvalue()
# Demo: fetch the CSDN blog front page through PycURL and print it.
url = 'http://blog.csdn.net'
content = getURLContent_pycurl(url)
# The parenthesised form prints the same on Python 2 and is also valid
# syntax on Python 3 (the bare print statement is not).
print(content)
擷取html的方法【三】:使用cPAMIE
# cPAMIE下載:http://sourceforge.net/project/showfiles.php?group_id=103662
# -*-coding: UTF-8 -*-
import cPAMIE
def getURLContent_cPAMIE(url):
    """Open ``url`` in a PAMIE browser session and return the page text.

    Args:
        url: Address passed to ``PAMIE.navigate``.

    Returns:
        Whatever ``PAMIE.pageGetText()`` reports once navigation returns.
        NOTE(review): presumably the rendered text rather than the raw
        HTML source -- confirm against the cPAMIE documentation.
    """
    browser = cPAMIE.PAMIE()
    try:
        browser.showDebugging = False
        browser.frameName = None
        browser.navigate(url)
        return browser.pageGetText()
    finally:
        # Always call quit(), even if navigation or text extraction
        # raises; previously an exception skipped it.
        browser.quit()
# Demo: fetch the CSDN blog front page through cPAMIE and print it.
url = 'http://blog.csdn.net'
content = getURLContent_cPAMIE(url)
# The parenthesised form prints the same on Python 2 and is also valid
# syntax on Python 3 (the bare print statement is not).
print(content)
擷取html的方法【四】:使用urllib下載檔案
# -*- coding: UTF-8 -*-
import urllib
# urlretrieve is urllib.urlretrieve on Python 2 and
# urllib.request.urlretrieve on Python 3; use whichever exists.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Demo: save the CSDN blog front page straight to a local file.
url = 'http://blog.csdn.net'
path = 'C://temp//csdn.net.html'
urlretrieve(url, path)
擷取html的方法【五】:利用Twisted架構之client.getPage
# Twisted架構下載:http://tmrc.mit.edu/mirror/twisted/Twisted/8.1/Twisted_NoDocs-8.1.0.win32-py2.5.exe
# -*-coding: UTF-8 -*-
from twisted.internet import reactor
from twisted.web import client


def result(content):
    """Print the downloaded page body, then stop the reactor.

    Args:
        content: Value the ``getPage`` deferred fires with on success.
    """
    print(content)
    reactor.stop()


def _report_failure(failure):
    """Print why the download failed, then stop the reactor."""
    print(failure.getErrorMessage())
    reactor.stop()


# NOTE(review): client.getPage was deprecated in later Twisted releases --
# confirm the installed version still provides it.
deferred = client.getPage("http://blog.csdn.net")
deferred.addCallback(result)
# Without an errback a failed fetch never reaches reactor.stop(), so
# reactor.run() below would block forever.
deferred.addErrback(_report_failure)
reactor.run()