"""Python crawler: CSDN web page download.

Fetches the CSDN blog front page, follows every category link found in its
navigation menu, and saves each article linked from those category pages as
a sequentially numbered ``.html`` file inside ``OUTPUT_DIR``.

The output directory must already exist; it is not created here.
"""

import re
import urllib.error
import urllib.parse
import urllib.request

BASE_URL = "http://blog.csdn.net"

# Only the User-Agent *value* belongs here; the header name is supplied
# separately when the opener is configured.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
)

# Destination directory for downloaded articles (Windows path).
OUTPUT_DIR = "d:\\data\\"

# Category entries in the front-page navigation menu.
CATEGORY_LINK_RE = re.compile(r'<li class=""><a href="(.*?)">')

# Article anchors on a category page: an absolute blog.csdn.net URL that
# contains at least one digit and is followed by a ``target`` attribute.
ARTICLE_LINK_RE = re.compile(
    r'<a href="(http://blog.csdn.net/.*[0-9].*?)" target=.*'
)


def extract_category_links(html):
    """Return the href values of the category menu entries found in *html*."""
    return CATEGORY_LINK_RE.findall(html)


def extract_article_links(html):
    """Return the absolute article URLs found in *html*."""
    return ARTICLE_LINK_RE.findall(html)


def category_url(link):
    """Resolve a category href against ``BASE_URL``.

    ``urljoin`` is used so that hrefs with a leading slash do not produce a
    double slash, and hrefs that are already absolute are left untouched.
    """
    return urllib.parse.urljoin(BASE_URL + "/", link)


def article_path(file_num):
    """Return the output file name for the article numbered *file_num*."""
    return OUTPUT_DIR + str(file_num) + ".html"


def fetch_page(opener, url):
    """Download *url* through *opener* and return its body as text.

    Raises:
        urllib.error.URLError: if the request fails (includes HTTPError).
    """
    # The context manager closes the HTTP response even if read() fails.
    with opener.open(url) as response:
        # Undecodable bytes are dropped: only the links matter here.
        return response.read().decode("utf-8", errors="ignore")


def report_url_error(err):
    """Print the HTTP status code and/or reason carried by *err*."""
    if hasattr(err, "code"):
        print(err.code)
    if hasattr(err, "reason"):
        print(err.reason)


def main():
    """Crawl the front page, then every category, saving each article."""
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", USER_AGENT)]
    # urlretrieve() goes through the globally installed opener, so the
    # custom User-Agent must be installed for the article downloads too.
    urllib.request.install_opener(opener)

    front_page = fetch_page(opener, BASE_URL)

    file_num = 0
    for category_link in extract_category_links(front_page):
        try:
            category_page = fetch_page(opener, category_url(category_link))
        except urllib.error.URLError as err:
            # A broken category must not stop the remaining categories.
            report_url_error(err)
            continue

        for link in extract_article_links(category_page):
            file_num += 1
            try:
                urllib.request.urlretrieve(link, article_path(file_num))
            except urllib.error.URLError as err:
                # Skip this article only; keep downloading the others.
                report_url_error(err)


if __name__ == "__main__":
    main()