This article shows how to use Python to capture the images on a web page and save them to the local machine.
In a previous article I shared how to batch-capture images from remote web pages with PHP and save them to the local machine; refer to that article if you are interested.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

''' get the file extension '''
def get_file_extension(file):
    return os.path.splitext(file)[1]

''' create the file directory if needed and return it '''
def mkdir(path):
    # strip whitespace on both sides
    path = path.strip()
    # strip a trailing \ character
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

''' generate a unique string with a fixed length of 36 '''
def unique_str():
    return str(uuid.uuid1())

'''
fetch the content of a remote file into memory
@url  the file to fetch (path + filename)
'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException as e:
        print e
        return None

'''
save a file to a local directory
@path       local path
@file_name  file name
@data       file content
'''
def save_file(path, file_name, data):
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get the file extension
print get_file_extension("123.jpg")
# create a file directory and return it
# print mkdir("d:/ljq")
# generate a unique string with a fixed length of 36
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
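The script above is written for Python 2 (urllib2 and cookielib no longer exist in Python 3). Purely as a rough sketch of my own, not part of the original article, the same single-image download could look like this on Python 3 using urllib.request and http.cookiejar; the URL is the same test avatar, and the ./images directory is just a placeholder.

# Python 3 sketch of the same idea: fetch one file through a cookie-aware
# opener and write it to a local directory. Paths and URL are placeholders.
import os
import urllib.request
from http.cookiejar import LWPCookieJar

def get_file(url):
    """Download the raw bytes at url, or return None on failure."""
    try:
        cj = LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        with opener.open(url) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Create path if needed and write data to path/file_name."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

if __name__ == "__main__":
    # placeholder URL and directory; adjust to your environment
    url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
    save_file("./images", "123.jpg", get_file(url))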
The following script uses Python to capture the images at a specified URL and save them to the local machine.
# -*- coding: utf-8 -*-
__author__ = 'yangt'
"""fetch images from a specific url, v1.0"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status == 302:
            # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:
            # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception as e:
        print e.__class__, e, url
        found = False
    return found

"""get html source, return a list of lines"""
def gGetHtmlLines(url):
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception as e:
        print "gGetHtmlLines() error! Exception ==>> " + str(e)
        return

"""get html source, return a string"""
def gGetHtml(url):
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception as e:
        print "gGetHtml() error! Exception ==>> " + str(e)
        return

"""get the file name from the url"""
def gGetFileName(url):
    if url is None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name of the given type"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""get the absolute link from the page url and a link found on it"""
def gGetAbslLink(url, link):
    if url is None or link is None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match the regular expression against the input lines, return a list"""
def gGetRegList(linesList, regx):
    if linesList is None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download a file from a url, saving it under the given file name"""
def gDownloadWithFilename(url, savePath, file):
    # parameter checks are omitted
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError as error:
        print "DOWNLOAD %s ERROR! ==>> %s" % (url, error)
    except Exception as e:
        print "Exception ==>> " + str(e)

"""download a file from a url; the file name is taken from the url"""
def gDownload(url, savePath):
    # parameter checks are omitted
    fileName = gGetFileName(url)
    # fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download every jpg on a web page, given the page url"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    # get the page source
    lines = gGetHtmlLines(downloadUrl)
    regx = r'src\s*="?(\S+)\.jpg'
    # get the links which match the regular expression
    lists = gGetRegList(lines, regx)
    if lists is None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the origin site address from the url"""
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""get the parent directory from the url"""
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""get the absolute address of a '..'-style link from the page url"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            # one more level up
            numOfFather = i + 1
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""get related htm and html links from the url, return a list"""
def gGetHtmlLink(url):
    # parameter checks are omitted
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r'href="?(\S+)\.htm'
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""capture the jpg files on the page and on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter checks are omitted
    gDownloadHtmlJpg(url, savePath)
    # capture jpg files on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = 'http://site.douban.com/196738/room/2462453/'
    save = '/root/python/tmp/'
    # print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
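Again only as a hedged Python 3 sketch of my own (the article's code targets Python 2 with urllib and httplib), the core of gDownloadHtmlJpg above, fetch a page, collect the src="...jpg" links and download each one, can be condensed as below. Here urllib.parse.urljoin stands in for the hand-written gGetHttpAddr/gGetHttpAddrFather helpers, and the regular expression, example URL and save directory are placeholders, not the author's exact code.

# Python 3 sketch: fetch a page, extract src="...jpg" links, download each one.
# The regex and the example URL/directory are illustrative only.
import os
import re
import urllib.parse
import urllib.request

def download_page_jpgs(page_url, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    with urllib.request.urlopen(page_url) as resp:
        html = resp.read().decode("utf-8", errors="ignore")
    # collect src="...jpg" links (very rough; an HTML parser is safer)
    for link in set(re.findall(r'src\s*=\s*"([^"]+\.jpg)"', html, re.IGNORECASE)):
        jpg_url = urllib.parse.urljoin(page_url, link)   # resolve relative links
        file_name = jpg_url.split("/")[-1]
        try:
            urllib.request.urlretrieve(jpg_url, os.path.join(save_dir, file_name))
            print("saved", file_name)
        except OSError as e:
            print("failed", jpg_url, e)

if __name__ == "__main__":
    # placeholder page and directory; change them to suit your environment
    download_page_jpgs("http://example.com/", "./tmp")

For real pages an HTML parser such as the standard library's html.parser is more robust than a regular expression, but the regex keeps the sketch close to the original script's approach.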
That is all the code this article covers for capturing the images on a web page with Python and saving them locally; I hope you find it useful.
Articles you may be interested in:
- Python captures web images and places them in a specified folder
- Python web image capture example (python crawler)
- Use Python 3 to write a script that captures a web page and only its images