#!/usr/bin/env python
# Author: Mini
# Crawl the Sina news front page and save every linked news article as a
# numbered .html file under e:/m/.
# NOTE(review): reconstructed from a case-mangled source; the URL, save path,
# and printed strings are kept as in the original.
import urllib.request
import urllib.error
import re

data = urllib.request.urlopen("http://news.sina.com.cn/").read()
# Decode permissively: the page may contain bytes that are not valid UTF-8.
data1 = data.decode("utf-8", "ignore")
# Absolute links into news.sina.com.cn, captured non-greedily.
pat = r'href="(http://news.sina.com.cn/.*?)">'
allurl = re.compile(pat).findall(data1)
for i, thisurl in enumerate(allurl):
    try:
        print(str(i) + "\n\ntime")
        fh = "e:/m/" + str(i) + ".html"
        urllib.request.urlretrieve(thisurl, fh)
        print("success!")
    except urllib.error.URLError as e:
        # URLError may carry an HTTP status code and/or a reason string;
        # report whichever is present.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
# ----------------------------------------------------------------------
# Crawl the CSDN blog front page through an HTTP proxy and save every linked
# blog page as a numbered .html file under e:/m/.
import urllib.request
import re
import urllib.error


def use_proxy(url, proxy_addr):
    """Fetch ``url`` through the HTTP proxy ``proxy_addr`` and return the page.

    Installs the proxy opener globally, so all subsequent urllib requests
    (including the urlretrieve calls below) also go through the proxy.
    Returns the body decoded as UTF-8 with undecodable bytes ignored.
    """
    proxy = urllib.request.ProxyHandler({"http": proxy_addr})
    opener1 = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener1)
    # Return the page directly so the caller does not need a second fetch
    # (the original fetched the URL twice).
    return urllib.request.urlopen(url).read().decode("utf-8", "ignore")


proxy_addr = "220.161.37.21:8118"
url = "http://blog.csdn.net/"
# Spoof a desktop Chrome user agent so the server serves the normal page.
# (The original string had browser-devtools UI text fused into it; stripped.)
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
           "(KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
data = use_proxy(url, proxy_addr)
print(len(data))
# NOTE(review): the original pattern line was garbled beyond recovery; this is
# a plausible reconstruction matching absolute blog links -- verify against
# the live page markup.
pat = r'href="(http://blog.csdn.net/.*?)"'
res = re.compile(pat).findall(data)
for i, link in enumerate(res):
    try:
        fil = "e:/m/" + str(i) + ".html"
        # Original passed the whole result list to urlretrieve; retrieve the
        # individual matched link instead.
        urllib.request.urlretrieve(link, filename=fil)
        print(str(i), "\n\ntime")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
# Web crawling (Plus 5): news crawling and proxy usage