Python implementation: crawling the site templates at http://www.cssmoban.com/cssthemes and downloading them
The implementation code is as follows:
#-*-Coding:utf-8-*-import urlparseimport urllib2import reimport os import os.pathurl= ' http://www.cssmoban.com/cssthe Mes ' #全局超时设置 urllib2.socket.setdefaulttimeout #依据url获取内容def geturlcontent (URL): response = Urllib2.urlopen (URL) h tml = Response.read (); Return html# gets the a tag in the HTML. And the format is <a target= "_blank" href= "/showcase/*" > Def getallurl (HTML): Return Re.findall (' <a[\\s]+href= '/ Cssthemes/\d+\.shtml ">.*?\/a> ', HTML) #获取下载文件的标题def getdowntitle (HTML): Return Re.findall (' \
\
\/a> ', HTML) #获取下一页的urldef GetNextURL (HTML): Return Re.findall (' <a.*?
Next page </a> ', html ' #下载文件def download (title,url): result = Urllib2.urlopen (URL). Read () If os.path.exists ("template/" ) ==false:os.makedirs ("template/") newname= ("template/" +title.decode (' Utf-8 ')) newname=newname+ '. ' +url[url.rfind ('. ') +1:len (URL)] Open (NewName, "WB"). Write (Result) #记录日志def I (msg): Fileobj=open (' Info.log ', ' a ') fileobj.write (msg+ ' \ n ') Fileobj.close (); Print msg# record error log def e (msg): Fileobj=open (' Error.log ', ' a ') fileobj.write (msg+ ' \ n ') fileobj.close (); Print Msgif __name__ = = ' __main__ ': #print getdownurl (' <a href= ' http://down.cssmoban.com/cssthemes1/cctp_17_ Jeans.zip "target=" _blank "class=" button Btn-down "title=" free download "><i class=" Icon-down icon-white "></i> <i class= "Icon-white icon-down-transiton" ></i> free download </a> ') html= geturlcontent (URL) I (' Start download:%s ') % (URL)) while True:lista= Getallurl (HTML); #print lista; Nextpage=getnexturl (HTML) #print nextpage[0] Nexturl= ' #i (' next page%s '% (nextPage)) If Len (nextPage) <=0:e (' Address:%s. No next page found, program exit '% (nextPage)) break; Nexturl=nextpage[0] nexturl=url+ '/' +nexturl[nexturl.index (' href= ') +6:nexturl.index (' "target ')] #print next Page for a in lista:downgotourl= ' try: #print a.decode (' Utf-8 ') Downgotourl= (url+ ' +a[a.index (' href= ') +6:a.index (' "> ')]) downgotourl=downgotourl.replace (URL, ' http:/ /www.cssmoban.com ') #print downgotourl downhtml=geturlcontent (downgotourl) #p Rint downhtml downtitlelist= getdowntitle (downhtml) downtitle= "If Len (downt itlelist) >0:downtitle=downtitlelist[0] #print downtitle downurllist= Getdownurl (downhtml) downurl= "If Len (downurllist) >0: Downurl=downurllist[0] downurl= downurl[downurl.index (' href= "') +6:downurl.index ('" Target ')] #print downurl I (' Start download:%s, file name:%s '% (downurl,downtitle)) Download (Downtitle,downurl) I ('%s ' download complete. 
Save file Name:%s '% (downurl,downtitle)) except Exception,e:e (' address:%s failed to download, failure message: '% (downgotourl) ') E (str (e)) I ('-----------------------------------------') I (' Run next page:%s '% (Nexturl)) Html= geturlcontent (Nexturl)
Incredibly powerful! This Python script crawls the templates on the cssmoban site and downloads them.