Capture images from a web page and save them locally with Python - Python tutorial

Source: Internet
Author: User
This article introduces how to capture images from a web page and save them locally using Python; if you are interested in capturing web page images with Python, read on. In a previous article, I shared PHP source code for batch-capturing remote web page images and saving them to a local machine; if you are interested, click through to learn more.

# -*- coding: UTF-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the file directory and return the directory'''
def mkdir(path):
    # Remove spaces on both sides of the path
    path = path.strip()
    # Remove the trailing \ symbol
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Automatically generate a unique string with a fixed length of 36'''
def unique_str():
    return str(uuid.uuid1())

'''Capture the web page file content and keep it in memory
@url the URL of the file to capture (path + filename)'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''Save the file to a local directory
@path local path
@file_name file name
@data file content'''
def save_file(path, file_name, data):
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# Obtain the file suffix
print get_file_extension("123.jpg")
# Create a file directory and return this directory
# print mkdir("d:/ljq")
# Automatically generate a unique string with a fixed length of 36
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
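The script above targets Python 2 (urllib2, cookielib). For readers on Python 3, where these modules were merged into urllib.request and http.cookiejar, a minimal sketch of the same fetch-and-save idea might look like the following; the URL and output directory are placeholders, not part of the original script.

# -*- coding: utf-8 -*-
# Python 3 sketch of the same idea: fetch a remote file through a cookie-aware
# opener and write the bytes to a local path. URL and directory are examples.
import os
import urllib.request
from http.cookiejar import LWPCookieJar

def get_file(url):
    """Download the raw bytes at url, or return None on failure."""
    try:
        cj = LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        with opener.open(urllib.request.Request(url)) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Create path if needed and write data to path/file_name."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

if __name__ == "__main__":
    url = "https://example.com/logo.jpg"   # placeholder URL
    save_file("./images", "logo.jpg", get_file(url))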

Use Python to capture images from a specified URL and save them to a local device.

# -*- coding: UTF-8 -*-
__author__ = 'yangt'
"""fetch images from specific url
v1.0"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status == 302:
            # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:
            # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get html src, return lines[]"""
def gGetHtmlLines(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() Error! Exception ==>> " + str(e)
        return

"""get html src, return string"""
def gGetHtml(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() Error! Exception ==>> " + str(e)
        return

"""get the file name based on the url"""
def gGetFileName(url):
    if url == None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""get the absolute link based on the url and a link found on it"""
def gGetAbslLink(url, link):
    if url == None or link == None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match the regular expression against the input lines, return a list"""
def gGetRegList(linesList, regx):
    if linesList == None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download a file from a url; the file name is given as a parameter"""
def gDownloadWithFilename(url, savePath, file):
    # parameter check is omitted
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR! ==>> %s" % (url, error)
    except Exception, e:
        print "Exception ==>> " + str(e)

"""download a file from a url; the file name is obtained automatically from the url"""
def gDownload(url, savePath):
    # parameter check is currently omitted
    fileName = gGetFileName(url)
    # fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download the jpg images of a web page based on the url of the page"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r'src\s*="?(\S+)\.jpg'
    lists = gGetRegList(lines, regx)  # get the links which match the regular expression
    if lists == None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the origin site address based on the url"""
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""retrieve the parent directory based on the url"""
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""get the absolute address of a '..'-style link based on the url it appears on"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of parent levels
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""get the related htm and html links based on the url, return a list"""
def gGetHtmlLink(url):
    # parameter check is omitted
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r'href="?(\S+)\.htm'
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""based on the url, capture the jpg images on it and on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter check is omitted
    gDownloadHtmlJpg(url, savePath)
    # capture the jpg images on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = 'http://site.douban.com/196738/room/2462453/'
    save = '/root/python/tmp/'
    # print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
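As a quick usage sketch, assuming the script above were saved as a module named fetch_jpg.py (a hypothetical name, not from the original), it could be driven for another page and save path like this; the page URL and directory below are examples only.

# Hypothetical usage of the functions defined above (Python 2, like the script itself).
# The module name, page URL, and save path are examples, not part of the original.
from fetch_jpg import gDownloadHtmlJpg, gDownloadAllJpg

page = 'http://www.example.com/gallery/index.htm'  # example page containing <img src="...jpg">
dest = '/tmp/jpg/'                                  # should end with '/' since paths are concatenated

# grab only the jpg images referenced directly on the page
gDownloadHtmlJpg(page, dest)

# or grab the page's jpg images plus those on the .htm pages it links to
gDownloadAllJpg(page, dest)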

The code above is this editor's introduction to capturing all the images on a web page with Python and saving them locally. I hope you find it useful.
