# This article presents a Python web-page scraping example (a simple image crawler). The complete code follows:
# -*- coding: utf-8 -*-
'''
Created on 2014-4-24
@author: Leon Wong
'''
import os
import re
import time
import urllib
import urllib2
import uuid
# Obtain the url of the second-level page
def findUrl2(html):
    """Return the distinct second-level (detail) page URLs found in *html*.

    Args:
        html: HTML text of a tag listing page.

    Returns:
        A list of matching URLs with duplicates removed, in order of first
        appearance.  Empty when nothing matches.
    """
    # NOTE(review): this pattern was reconstructed from a corrupted listing.
    # It accepts "http://tuchong.com/<digits>/<digits>/" and
    # "http://<user>.tuchong.com/<digits>/", and the look-behind rejects the
    # "photos" image host.  Confirm against the site's current URL scheme.
    re1 = (r'http://tuchong\.com/\d+/\d+/'
           r'|http://\w+(?<!photos)\.tuchong\.com/\d+/')

    # Order-preserving de-duplication in one pass; the set gives O(1)
    # membership tests instead of re-scanning the match list per element.
    seen = set()
    url2lstfltr = []
    for url in re.findall(re1, html):
        if url not in seen:
            seen.add(url)
            url2lstfltr.append(url)
    return url2lstfltr
# Retrieving html text
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to retrieve.

    Returns:
        The page content as a unicode string.

    Raises:
        urllib2.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read().decode('utf-8')
    finally:
        # Python 2 responses are not context managers, so close explicitly
        # to release the socket even when read/decode raises.
        response.close()
# Download an image to a local device
def download(html_page, pageNo, base_dir='d:\\TuChong'):
    """Download every photo linked from *html_page* into a dated folder.

    Files are written to ``<base_dir>/<year>-<month>-<day>/<pageNo>/`` under
    a freshly generated UUID name with a ``.jpg`` extension.

    Args:
        html_page: HTML text of a detail page.
        pageNo: Listing page number; used as the innermost folder name.
        base_dir: Root folder for downloads.  Defaults to the location the
            script originally hard-coded.

    Returns:
        The result of the last ``urllib.urlretrieve`` call, or ``None`` when
        the page links no photos.
    """
    # NOTE(review): pattern reconstructed from a corrupted listing (roughly
    # "photos.tuchong.com/<anything>/f/<anything>.jpg").  The wildcards are
    # limited to non-quote, non-space characters so that one match cannot
    # swallow several URLs on the same line.  Confirm against real pages.
    re2 = r'http://photos\.tuchong\.com/[^\s"\'<>]+?/f/[^\s"\'<>]+?\.jpg'
    imglist = re.findall(re2, html_page)
    print(imglist)
    if not imglist:
        return None

    # Folder name is today's local date without zero padding, e.g. 2014-4-24.
    today = time.localtime()
    foldername = '%d-%d-%d' % (today.tm_year, today.tm_mon, today.tm_mday)
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    # The target folder is the same for every image, so create it once.
    if not os.path.exists(picpath):
        os.makedirs(picpath)

    download_img = None
    for imgurl in imglist:
        target = os.path.join(picpath, '%s.jpg' % uuid.uuid1())
        print("The photos location is:" + target)
        download_img = urllib.urlretrieve(imgurl, target)
        # Pause one second between downloads (presumably to avoid hammering
        # the server).
        time.sleep(1)
        print(imgurl)
    return download_img
# def callback(blocknum, blocksize, totalsize):
#     '''Callback function
#     @blocknum: number of data blocks downloaded so far
#     @blocksize: size of each data block
#     @totalsize: size of the remote file
#     '''
#     print str(blocknum), str(blocksize), str(totalsize)
#     if blocknum * blocksize >= totalsize:
#         print 'Download completed'
def quitit():
    """Print a farewell message and terminate the program with status 0.

    Raises:
        SystemExit: Always, with exit code 0.
    """
    print("Bye!")
    # The bare exit() helper is injected by the site module and is not
    # guaranteed to exist; raising SystemExit is the portable equivalent.
    raise SystemExit(0)
def _read_page_number():
    """Prompt until the user enters a page number from 1 to 100.

    Entering 'quit' ends the program via quitit().

    Returns:
        The accepted page number, as the string the user typed.
    """
    answer = raw_input("Input the page number you want to scratch (1-100), "
                       "please input 'quit' if you want to quit> ").strip()
    # The prompt promises 1-100, so reject 0 as well as values above 100.
    while not (answer.isdigit() and 1 <= int(answer) <= 100):
        if answer == 'quit':
            quitit()
        print("Param is invalid, please try again.")
        answer = raw_input("Input the page number you want to scratch> ").strip()
    return answer


def main():
    """Show the banner, ask for a page number and download that page's photos."""
    border = '*' * 41
    banner_lines = [border]
    for text in ("Welcome to Spider for TUCHONG",
                 "Created on 2014-4-24",
                 "@author: Leon Wong"):
        banner_lines.append("**%s**" % text.center(37))
    banner_lines.append(border)
    print("\n".join(banner_lines))

    pageNo = _read_page_number()

    # Crawl the TuChong "portrait" tag listing: the percent-encoded path
    # segment decodes to the Chinese word for "portrait".
    html = getHtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F?page=" + str(pageNo))
    for detail in findUrl2(html):
        download(getHtml(detail), pageNo)
    print("Finished.")


if __name__ == '__main__':
    main()