# -*- coding: utf-8 -*-
'''
Created on 2014-4-24
@author: Leon Wong
'''
import os
import re
import time
import urllib
import urllib2
import uuid
# Collect the second-level (detail) page URLs.
def findUrl2(html):
    """Return the detail-page URLs found in *html*, without duplicates.

    Two URL shapes are recognised:

    * ``http://tuchong.com/<digits>/<digits>/``
    * ``http://<subdomain>.tuchong.com/<digits>/`` where the subdomain does
      not end in ``photos`` (the negative lookbehind keeps image-host links
      out of the result).

    The returned list keeps the order in which each URL first appears in
    the page.
    """
    pattern = (r'http://tuchong\.com/\d+/\d+/'
               r'|http://\w+(?<!photos)\.tuchong\.com/\d+/')
    seen = set()
    unique_urls = []
    # Single pass: the set answers "already seen?" in O(1) while the list
    # preserves first-appearance order.
    for url in re.findall(pattern, html):
        if url not in seen:
            seen.add(url)
            unique_urls.append(url)
    return unique_urls
# Fetch the HTML text of a page.
def gethtml(url):
    """Download *url* and return its body decoded as UTF-8 text.

    The response object is always closed, even when reading or decoding
    raises.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read().decode('utf-8')  # decode the bytes as utf-8
    finally:
        response.close()
# Download the images to the local disk.
def download(html_page, pageNo, base_dir='d:\\tuchong'):
    """Save every photo linked from *html_page* below *base_dir*.

    Images are stored in ``<base_dir>/<year>-<month>-<day>/<pageNo>/`` (the
    date is today's local date, not zero-padded) under a freshly generated
    UUID file name with a ``.jpg`` extension.

    html_page -- HTML text of a detail page.
    pageNo    -- listing page number; only used to name the sub-folder.
    base_dir  -- root folder for the downloads.

    Returns whatever ``urllib.urlretrieve`` returned for the last image, or
    ``None`` when the page contains no matching image URL (in which case no
    folder is created).
    """
    # Non-greedy and quote/whitespace-free segments so that two image URLs
    # on the same line are not merged into one match.
    img_pattern = r'http://photos\.tuchong\.com/[^"\'\s]+?/f/[^"\'\s]+?\.jpg'
    imglist = re.findall(img_pattern, html_page)
    print(imglist)
    if not imglist:
        return None

    # Folder name is built from today's local date.
    now = time.localtime()
    foldername = "%d-%d-%d" % (now.tm_year, now.tm_mon, now.tm_mday)
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    if not os.path.isdir(picpath):
        os.makedirs(picpath)

    download_img = None
    for imgurl in imglist:
        target = os.path.join(picpath, "%s.jpg" % uuid.uuid1())
        print("The photos location is:" + target)
        # Save the image to the chosen path.
        download_img = urllib.urlretrieve(imgurl, target)
        time.sleep(1)  # pause between requests
        print(imgurl)
    return download_img
# def callback(blocknum, blocksize, totalsize):
#     '''callback function
#     @blocknum: data blocks that have already been downloaded
#     @blocksize: size of data blocks
#     @totalsize: the size of the remote file
#     '''
#     print str(blocknum), str(blocksize), str(totalsize)
#     if blocknum * blocksize >= totalsize:
#         print 'Download complete'
def quitit():
    """Print a farewell message and terminate the program with status 0."""
    # Imported locally so the function does not depend on the ``exit``
    # helper, which is only installed by the ``site`` module.
    import sys

    print("bye!")
    sys.exit(0)
if __name__ == '__main__':
    # Interactive entry point: ask for a listing page number, collect the
    # detail pages linked from it, and download the photos of each one.
    print('''*****************************************
* * Welcome to Spider for Tuchong * *
* * Created on 2014-4-24 * *
* * @author: Leon Wong * *
''')
    pageNo = raw_input("Input the page number you want to scratch (1-100), please input 'quit' if you want to quit>")
    # Keep asking until the answer is a whole number within 1-100; typing
    # 'quit' leaves the program instead.
    while not (pageNo.isdigit() and 1 <= int(pageNo) <= 100):
        if pageNo == 'quit':
            quitit()
        print("Param is invalid, try again.")
        pageNo = raw_input("Input the page number you want to scratch >")
    # Crawl the Tuchong "portrait" tag listing (the tag is URL-encoded).
    html = gethtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F/?page=" + str(pageNo))
    detllst = findUrl2(html)
    for detail in detllst:
        html2 = gethtml(detail)
        download(html2, pageNo)
    print("Finished.")
# Python crawls web images