This article presents a complete, simple Python example that captures (downloads) the images on a web page. It may be a useful reference for anyone who needs to do the same.
The steps for capturing images from the web with Python are:
1. Fetch the source code of the web page at the given URL.
2. Use a regular expression to extract the image addresses from that source code.
3. Download each image from the extracted addresses.
The following is a relatively simple implementation that grabs the images from a Baidu Tieba thread page:
# -*- coding: utf-8 -*-
# feimengjuan
# Grab the images of a web page: fetch the page source, extract the image
# addresses with a regular expression, then download each image.
import contextlib
import re

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve

# Captures the address of a .jpg image from src="..." when that attribute is
# immediately followed by a pic_ext attribute.
_IMG_RE = re.compile(r'src="(.+?\.jpg)" pic_ext')


def gethtml(url):
    """Return the source code of the web page at *url* as text.

    The raw bytes are decoded as UTF-8 with undecodable bytes replaced, so
    the result can be searched with a text regular expression on both
    Python 2 and Python 3.
    """
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urlopen(url)) as page:
        return page.read().decode('utf-8', 'replace')


def getimg(html):
    """Download every image address found in *html*.

    The addresses are extracted from the page source with a regular
    expression; each image is saved in the current working directory as
    0.jpg, 1.jpg, ... in the order the addresses appear.
    """
    for index, imgurl in enumerate(_IMG_RE.findall(html)):
        urlretrieve(imgurl, '%s.jpg' % index)


if __name__ == '__main__':
    # Fetch the page source, then find and download the images it references.
    # Guarded so that importing this module does not touch the network.
    html = gethtml("http://tieba.baidu.com/p/2460150866")
    getimg(html)
The code below is organized further so that it creates a local "picture" folder in which to save the images:
# -*- coding: utf-8 -*-
# feimengjuan
# Grab the images of a web page and save them in a local folder.
import contextlib
import os
import re

try:  # Python 3
    from urllib.error import URLError
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib2 import URLError, urlopen

# Captures the address of a .jpg image from src="..." when that attribute is
# immediately followed by a pic_ext attribute.
_IMG_RE = re.compile(r'src="(.+?\.jpg)" pic_ext')


def gethtml(url):
    """Return the source code of the web page at *url* as text.

    The raw bytes are decoded as UTF-8 with undecodable bytes replaced, so
    the result can be searched with a text regular expression on both
    Python 2 and Python 3.
    """
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urlopen(url)) as page:
        return page.read().decode('utf-8', 'replace')


def mkdir(path):
    """Create the folder *path* (used to hold the images) if it is missing.

    Surrounding whitespace is stripped from *path* before it is used.

    Returns:
        True if the folder was created, False if it already existed.
    """
    path = path.strip()
    if os.path.exists(path):
        # Already there: do not create it again, just report it.
        print(u'%s folder has been created successfully' % path)
        return False
    print(u'new name %s folder' % path)
    os.makedirs(path)
    return True


def saveimages(imglist, name):
    """Download each image address in *imglist* into the folder *name*.

    Files are numbered from 1 in list order. A file keeps the text after the
    last '.' of its address as extension when that text is at most three
    characters long; otherwise 'jpg' is used. A download that fails with
    URLError is reported and skipped, and the remaining images are still
    processed.
    """
    for number, image_url in enumerate(imglist, 1):
        ftail = image_url.split('.').pop()
        if len(ftail) > 3:
            ftail = 'jpg'
        filename = os.path.join(name, '%d.%s' % (number, ftail))
        try:
            with contextlib.closing(urlopen(image_url)) as response:
                data = response.read()
        except URLError as e:
            print(e.reason)
            continue
        # 'with' closes the file even if the write fails.
        with open(filename, 'wb') as f:
            f.write(data)
        print(u'is saving a picture for %s' % filename)


def getallimg(html):
    """Return the list of image addresses found in the page source *html*."""
    return _IMG_RE.findall(html)


if __name__ == '__main__':
    # Create the local folder, then download and save the images.
    html = gethtml("http://tieba.baidu.com/p/2460150866")
    path = u'picture'
    mkdir(path)
    imglist = getallimg(html)
    saveimages(imglist, path)
Result: dozens of images are saved in the "picture" folder, for example: