Python crawls web Images

Source: Internet
Author: User


# -*- coding: utf-8 -*-
'''
Created on 2014-4-24

@author: Leon Wong
'''

import os
import re
import sys
import time
import urllib
import urllib2
import uuid

# Collect the second-level (detail) page URLs.
def findUrl2(HTML):
    """Return the distinct detail-page URLs found in *HTML*, in first-seen order.

    Two URL shapes are recognised:

    * ``http://tuchong.com/<digits>/<digits>/``
    * ``http://<name>.tuchong.com/<digits>/`` where ``<name>`` does not end
      in ``photos`` (enforced by the negative look-behind).

    Args:
        HTML: text of the page to scan.

    Returns:
        A list of URL strings with duplicates removed; each URL keeps the
        position of its first occurrence in the page.
    """
    # Dots are escaped so that "." only matches a literal dot in the host.
    pattern = (r'http://tuchong\.com/\d+/\d+/'
               r'|http://\w+(?<!photos)\.tuchong\.com/\d+/')
    seen = set()
    unique_urls = []
    # Single pass keeps first-seen order without the quadratic
    # list.index() lookup that sorting a set by original position needs.
    for url in re.findall(pattern, HTML):
        if url not in seen:
            seen.add(url)
            unique_urls.append(url)
    return unique_urls

# Fetch the HTML text of a page.
def gethtml(URL):
    """Download *URL* and return its body decoded as UTF-8.

    Args:
        URL: address of the page to fetch.

    Returns:
        The response body as a unicode string.

    Raises:
        urllib2.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    response = urllib2.urlopen(URL)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

# Download the images referenced by a page to the local disk.
def download(html_page, PageNo, base_dir='d:\\tuchong'):
    """Save every ``photos.tuchong.com`` JPEG linked from *html_page*.

    Files are written to ``<base_dir>/<year>-<month>-<day>/<PageNo>/`` using
    today's local date (not zero-padded) and a UUID1-based file name.  The
    directory is created only when at least one image URL is found.

    Args:
        html_page: HTML text to scan for image URLs.
        PageNo: page number; used as the innermost folder name.
        base_dir: root folder for downloads (defaults to the original
            hard-coded location).

    Returns:
        The result of the last ``urllib.urlretrieve`` call, or ``None`` when
        the page contains no matching image URL.
    """
    # Character classes stop at quotes, whitespace and angle brackets so a
    # single match can never run across several URLs on the same line.
    image_pattern = r"http://photos\.tuchong\.com/[^\s\"'<>]+/f/[^\s\"'<>]+\.jpg"
    Imglist = re.findall(image_pattern, html_page)
    # Single-argument print(...) behaves the same as the print statement.
    print(Imglist)
    if not Imglist:
        return None

    today = time.localtime()
    foldername = '%d-%d-%d' % (today.tm_year, today.tm_mon, today.tm_mday)
    picpath = os.path.join(base_dir, foldername, str(PageNo))
    # The target folder is the same for every image, so create it once.
    if not os.path.exists(picpath):
        os.makedirs(picpath)

    download_img = None
    for imgurl in Imglist:
        target = os.path.join(picpath, '%s.jpg' % uuid.uuid1())
        print("The photos location is:" + target)
        # Fetch the image into the target path.
        download_img = urllib.urlretrieve(imgurl, target)
        time.sleep(1)  # pause one second between requests
        print(imgurl)
    return download_img


# def callback(blocknum, blocksize, totalsize):
#     '''callback function
#     @blocknum: data blocks that have already been downloaded
#     @blocksize: size of a data block
#     @totalsize: size of the remote file
#     '''
#     print str(blocknum), str(blocksize), str(totalsize)
#     if blocknum * blocksize >= totalsize:
#         print 'Download complete'

def quitit():
    """Print a farewell message and end the program with exit status 0.

    Raises:
        SystemExit: always, with code 0.
    """
    print("bye!")
    # sys.exit is used instead of the bare exit() helper, which is injected
    # by the site module and is not available when Python runs with -S.
    sys.exit(0)

def _ask_page_number():
    """Prompt until the user enters a page number in 1-100; 'quit' exits."""
    answer = raw_input("Input the page number you want to scratch (1-100), "
                       "please input 'quit' if you want to quit>").strip()
    # Reject 0 as well as values above 100: the prompt promises 1-100.
    while not (answer.isdigit() and 1 <= int(answer) <= 100):
        if answer == 'quit':
            quitit()
        print("Param is invalid, try again.")
        answer = raw_input("Input the page number you want to scratch >").strip()
    return answer


if __name__ == '__main__':
    print('''*****************************************
**    Welcome to Spider for Tuchong    **
**      Created on 2014-4-24           **
**      @author: Leon Wong             **
*****************************************''')
    PageNo = _ask_page_number()

    # Crawl the Tuchong "portrait" tag listing for the chosen page.
    html = gethtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F/?page=" + str(PageNo))

    # Visit every detail page linked from the listing and save its images.
    for detail in findUrl2(html):
        download(gethtml(detail), PageNo)
    print("Finished.")

Python crawls web Images

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.