Method 1 for fetching HTML: using urllib
# -*- coding: utf-8 -*-
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2


def getwebpagecontent(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the resource to download.

    Returns:
        Whatever ``read()`` yields for the opened resource (a byte string).

    Raises:
        IOError: Propagated unchanged from ``urlopen`` when the resource
            cannot be opened (``URLError`` on Python 3 is a subclass).
    """
    response = urlopen(url)
    # try/finally instead of ``with``: the Python 2 response object is not a
    # context manager, and the handle must be released even if read() fails.
    try:
        return response.read()
    finally:
        response.close()
# Example: download the CSDN blog front page and print it.
url = 'http://blog.csdn.net'
content = getwebpagecontent(url)
print(content)  # parenthesised form works as a statement on Python 2 and a call on Python 3
Method 2 for fetching HTML: using pycurl
# pycurl home page: http://pycurl.sourceforge.net/
# pycurl download: http://pycurl.sourceforge.net/download/pycurl-7.18.1.tar.gz
# -*- coding: utf-8 -*-
from io import BytesIO

import pycurl


def geturlcontent_pycurl(url):
    """Fetch *url* with pycurl and return the response body.

    Redirects are followed, up to a maximum of five hops.

    Args:
        url: Address of the resource to download.

    Returns:
        The bytes that libcurl handed to the write callback, concatenated.

    Raises:
        pycurl.error: Propagated unchanged when the transfer fails.
    """
    # pycurl delivers the body as bytes, so collect it in a bytes buffer.
    body = BytesIO()
    curl = pycurl.Curl()
    try:
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, body.write)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        # Optional proxy settings:
        # curl.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        # curl.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
        curl.perform()
    finally:
        # Release the libcurl handle even when perform() raises.
        curl.close()
    return body.getvalue()
# Example: download the CSDN blog front page with pycurl and print it.
url = 'http://blog.csdn.net'
content = geturlcontent_pycurl(url)
print(content)  # parenthesised form works as a statement on Python 2 and a call on Python 3
Method 3 for fetching HTML: using cPAMIE
# cPAMIE download: http://sourceforge.net/project/showfiles.php?group_id=103662
# -*- coding: utf-8 -*-
import cPAMIE


def geturlcontent_cpamie(url):
    """Load *url* through cPAMIE and return the page text.

    Args:
        url: Address of the page to open.

    Returns:
        The value of ``pageGetText()`` for the loaded page.
    """
    # NOTE(review): the mixed-case names below (PAMIE, showDebugging,
    # frameName, pageGetText) were restored from memory of the cPAMIE
    # module; confirm them against the installed version.
    g_ie = cPAMIE.PAMIE()
    try:
        g_ie.showDebugging = False
        g_ie.frameName = None
        g_ie.navigate(url)
        return g_ie.pageGetText()
    finally:
        # Always shut the browser instance down, even if navigation fails.
        g_ie.quit()
# Example: load the CSDN blog front page through cPAMIE and print its text.
url = 'http://blog.csdn.net'
content = geturlcontent_cpamie(url)
print(content)  # parenthesised form works as a statement on Python 2 and a call on Python 3
Method 4 for fetching HTML: downloading to a file with urllib
# -*- coding: utf-8 -*-
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

# Download the page at `url` and save it to the local file `path`.
url = 'http://blog.csdn.net'
path = 'c://temp//csdn.net.html'
urlretrieve(url, path)
Method 5 for fetching HTML: using client.getPage from the Twisted framework
# Twisted download: http://tmrc.mit.edu/mirror/twisted/Twisted/8.1/Twisted_NoDocs-8.1.0.win32-py2.5.exe
# -*- coding: utf-8 -*-
from twisted.internet import reactor
from twisted.web import client


def result(content):
    """Print the downloaded page and stop the reactor.

    Intended as the success callback of a page download; stopping the
    reactor is what allows the blocking ``reactor.run()`` call to return.

    Args:
        content: The page body passed in by the Deferred.
    """
    print(content)
    reactor.stop()
def _report_failure(failure):
    """Print why the download failed and stop the reactor."""
    failure.printTraceback()
    reactor.stop()


# NOTE(review): getPage is believed to be deprecated in recent Twisted
# releases; confirm the installed version still provides it.
deferred = client.getPage("http://blog.csdn.net")
deferred.addCallback(result)
# Without an errback a failed request would never stop the reactor and
# reactor.run() below would block forever.
deferred.addErrback(_report_failure)
reactor.run()