Python web image capture example (python crawler)

Source: Internet
Author: User
This article presents an example Python crawler that captures images from web pages. The complete code is shown below:


# -*- coding: utf-8 -*-
'''
Created on 2014-4-24

@ Author: Leon Wong
'''

import os
import re
import time
import uuid

try:
    # Python 2 module layout, which this script was written against.
    import urllib
    import urllib2
except ImportError:
    # Python 3 provides urlopen and urlretrieve in urllib.request; expose it
    # under both old names so urllib2.urlopen / urllib.urlretrieve calls
    # keep resolving.
    import urllib.request as urllib2
    urllib = urllib2

# Obtain the url of the second-level page
def findUrl2(html):
    """Return the detail-page ("second-level") URLs found in a listing page.

    Args:
        html: Text of a listing page.

    Returns:
        A list of the matching URLs with duplicates removed, in order of
        first appearance. Empty when nothing matches.
    """
    # NOTE(review): the pattern in the published listing was garbled and
    # partly lost; this is a reconstruction -- confirm against real pages.
    # It accepts two URL shapes:
    #   http://tuchong.com/<digits>/<digits>/
    #   http://<name>.tuchong.com/<digits>/
    # The negative lookbehind rejects the "photos" host so that image links
    # are not mistaken for detail pages.
    re1 = r'http://tuchong\.com/\d+/\d+/|http://\w+(?<!photos)\.tuchong\.com/\d+/'

    # Single pass, order-preserving de-duplication (the set gives O(1)
    # membership tests instead of repeated list.index scans).
    seen = set()
    url2lstfltr = []
    for url in re.findall(re1, html):
        if url not in seen:
            seen.add(url)
            url2lstfltr.append(url)
    return url2lstfltr

# Retrieving html text
def getHtml(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address to open with urllib2.urlopen.

    Returns:
        The response body as text.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Whatever urlopen raises when the URL cannot be opened.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

# Download an image to a local device
def download(html_page, pageNo, base_dir=r'd:\TuChong'):
    """Download every photo linked from a detail page to local disk.

    Files are written to <base_dir>/<year>-<month>-<day>/<pageNo>/ under a
    freshly generated UUID name with a .jpg extension. The date is today's
    local date, without zero padding.

    Args:
        html_page: Text of the detail page to scan for photo links.
        pageNo: Listing page number; used as the innermost folder name.
        base_dir: Root folder for downloads. Defaults to the Windows folder
            the script originally hard-coded.

    Returns:
        The result of the last urlretrieve call, or None when the page
        contains no photo links.
    """
    today = time.localtime(time.time())
    foldername = "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)

    # NOTE(review): reconstructed from a garbled listing -- confirm against
    # real pages. The ".+" parts are greedy, so several links on one line
    # would be captured as a single match.
    re2 = r'http://photos.tuchong.com/.+/f/.+\.jpg'
    imglist = re.findall(re2, html_page)
    print(imglist)

    download_img = None
    if not imglist:
        return download_img

    # The target folder is the same for every image, so build and create it
    # once instead of on each iteration.
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    if not os.path.exists(picpath):
        os.makedirs(picpath)

    for imgurl in imglist:
        filename = str(uuid.uuid1())
        target = os.path.join(picpath, "%s.jpg" % filename)
        print("The photos location is:" + target)
        download_img = urllib.urlretrieve(imgurl, target)
        # Pause between requests so the remote server is not hammered.
        time.sleep(1)
        print(imgurl)
    return download_img


# Def callback (blocknum, blocksize, totalsize ):
# ''' Callback function
# @ Blocknum: The downloaded data block
# @ Blocksize: size of the data block
# @ Totalsize: the size of the Remote File
#'''
# Print str (blocknum), str (blocksize), str (totalsize)
# If blocknum * blocksize> = totalsize:
# Print 'Download completed'

def quitit():
    """Print the farewell message and end the program with exit status 0.

    Raises:
        SystemExit: Always, with code 0.
    """
    print("Bye! ")
    # Raise SystemExit directly: the exit() helper is injected by the site
    # module for interactive use and is not guaranteed to exist in scripts.
    raise SystemExit(0)

if __name__ == '__main__':
    # Interactive entry point: ask for a listing page number, then download
    # every photo linked from the detail pages found on that listing page.
    print('''************************************ *****
** Welcome to Spider for TUCHONG **
** Created on 2014-4-24 **
** @ Author: Leon Wong **
**************************************** *''')

    # raw_input was renamed to input in Python 3; use whichever exists.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    pageNo = read_line("Input the page number you want to scratch (1-100), please input 'quit' if you want to quit> ")
    # Re-prompt until the answer is a whole number inside the advertised
    # 1-100 range; checking only the upper bound would let "0" through.
    while not (pageNo.isdigit() and 1 <= int(pageNo) <= 100):
        if pageNo == 'quit':
            quitit()
        print("Param is invalid, please try again .")
        pageNo = read_line("Input the page number you want to scratch> ")

    # Crawl the Tuchong tag listing; the percent-encoded tag in the URL
    # decodes to the Chinese word for "portrait".
    html = getHtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F?page=" + str(pageNo))

    detllst = findUrl2(html)
    for detail in detllst:
        html2 = getHtml(detail)
        download(html2, pageNo)
    print("Finished .")

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any of the content on this page confusing, please email us, and we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.