Capture images from a web page and save them locally with Python - Python tutorial

Source: Internet
Author: User
This article introduces how to capture images from a web page and save them locally using Python; if you are interested in capturing web page images with Python, read on. In a previous article, I shared PHP source code for batch-capturing remote web page images and saving them to a local machine; if you are interested, click through to learn more.

# -*- coding: UTF-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the file directory and return the directory'''
def mkdir(path):
    # Remove spaces on both sides of the path
    path = path.strip()
    # Remove the trailing \ symbol
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Automatically generate a unique string with a fixed length of 36'''
def unique_str():
    return str(uuid.uuid1())

'''Capture the web page file content and keep it in memory
@url the URL of the file to capture (path + filename)'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''Save the file to a local directory
@path local path
@file_name file name
@data file content'''
def save_file(path, file_name, data):
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# Obtain the file suffix
print get_file_extension("123.jpg")
# Create a file directory and return this directory
# print mkdir("d:/ljq")
# Automatically generate a unique string with a fixed length of 36
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
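The script above targets Python 2 (urllib2, cookielib). For readers on Python 3, where these modules were merged into urllib.request and http.cookiejar, a minimal sketch of the same fetch-and-save idea might look like the following; the URL and output directory are placeholders, not part of the original script.

# -*- coding: utf-8 -*-
# Python 3 sketch of the same idea: fetch a remote file through a cookie-aware
# opener and write the bytes to a local path. URL and directory are examples.
import os
import urllib.request
from http.cookiejar import LWPCookieJar

def get_file(url):
    """Download the raw bytes at url, or return None on failure."""
    try:
        cj = LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        with opener.open(urllib.request.Request(url)) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Create path if needed and write data to path/file_name."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

if __name__ == "__main__":
    url = "https://example.com/logo.jpg"   # placeholder URL
    save_file("./images", "logo.jpg", get_file(url))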

Use Python to capture images from a specified URL and save them to a local device.

# -*- coding: UTF-8 -*-
__author__ = 'yangt'
"""fetch images from specific url
v1.0"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status == 302:
            # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:
            # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get html src, return lines[]"""
def gGetHtmlLines(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() Error! Exception ==>> " + str(e)
        return

"""get html src, return string"""
def gGetHtml(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() Error! Exception ==>> " + str(e)
        return

"""get the file name based on the url"""
def gGetFileName(url):
    if url == None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""get the absolute link based on the url and a link found on it"""
def gGetAbslLink(url, link):
    if url == None or link == None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match the regular expression against the input lines, return a list"""
def gGetRegList(linesList, regx):
    if linesList == None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download a file from a url; the file name is given as a parameter"""
def gDownloadWithFilename(url, savePath, file):
    # parameter check is omitted
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR! ==>> %s" % (url, error)
    except Exception, e:
        print "Exception ==>> " + str(e)

"""download a file from a url; the file name is obtained automatically from the url"""
def gDownload(url, savePath):
    # parameter check is currently omitted
    fileName = gGetFileName(url)
    # fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download the jpg images of a web page based on the url of the page"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r'src\s*="?(\S+)\.jpg'
    lists = gGetRegList(lines, regx)  # get the links which match the regular expression
    if lists == None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the origin site address based on the url"""
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""retrieve the parent directory based on the url"""
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""get the absolute address of a '..'-style link based on the url it appears on"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of parent levels
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""get the related htm and html links based on the url, return a list"""
def gGetHtmlLink(url):
    # parameter check is omitted
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r'href="?(\S+)\.htm'
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""based on the url, capture the jpg images on it and on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter check is omitted
    gDownloadHtmlJpg(url, savePath)
    # capture the jpg images on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = 'http://site.douban.com/196738/room/2462453/'
    save = '/root/python/tmp/'
    # print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
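As a quick usage sketch, assuming the script above were saved as a module named fetch_jpg.py (a hypothetical name, not from the original), it could be driven for another page and save path like this; the page URL and directory below are examples only.

# Hypothetical usage of the functions defined above (Python 2, like the script itself).
# The module name, page URL, and save path are examples, not part of the original.
from fetch_jpg import gDownloadHtmlJpg, gDownloadAllJpg

page = 'http://www.example.com/gallery/index.htm'  # example page containing <img src="...jpg">
dest = '/tmp/jpg/'                                  # should end with '/' since paths are concatenated

# grab only the jpg images referenced directly on the page
gDownloadHtmlJpg(page, dest)

# or grab the page's jpg images plus those on the .htm pages it links to
gDownloadAllJpg(page, dest)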

The code above is this editor's introduction to capturing all the images on a web page with Python and saving them locally. I hope you find it useful.
