Python captures images on a webpage and saves them to a local device

Source: Internet
Author: User

Below is a Python 2 script that uses urllib2 and cookielib to download an image from a URL and save it to a local directory.

In a previous article, I shared how to batch-capture images from remote web pages with PHP and save them to a local machine. If you are interested, click here to learn more.

# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the file directory if needed and return it'''
def mkdir(path):
    # Remove spaces on both sides of the path
    path = path.strip()
    # Remove a trailing \ symbol
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Automatically generate a unique string with a fixed length of 36'''
def unique_str():
    return str(uuid.uuid1())

'''
Fetch the file content at a URL and keep it in memory
@url the file to fetch
'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''
Save the file to a local directory
@path local path
@file_name file name
@data file content
'''
def save_file(path, file_name, data):
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# Get the file extension
print get_file_extension("123.jpg")
# Create a file directory and return it
# print mkdir("d:/ljq")
# Automatically generate a unique string with a fixed length of 36
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
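The script above targets Python 2 (urllib2, cookielib, print statements) and will not run on a modern interpreter as-is. A minimal Python 3 sketch of the same download-and-save flow, assuming only the standard library, might look like this; the URL and save directory below are placeholders, not part of the original article:

# Minimal Python 3 sketch of the same idea (assumption: Python 3.x, standard library only).
import os
import uuid
import urllib.request
import http.cookiejar

def get_file(url):
    """Fetch the raw bytes at url, sending cookies like the Python 2 version."""
    cj = http.cookiejar.LWPCookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    try:
        with opener.open(url) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Create path if needed and write data to path/file_name."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

if __name__ == "__main__":
    # Hypothetical example URL and save directory for illustration only.
    url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
    save_file("./downloads", str(uuid.uuid1()) + ".jpg", get_file(url))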

Use Python to capture the images at a specified URL and save them to a local device.

# -*- coding: utf-8 -*-
__author__ = 'yangt'
"""fetch images from a specific url, v1.0"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status == 302:
            # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:
            # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get html source, return lines[]"""
def gGetHtmlLines(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> " + str(e)
        return

"""get html source, return string"""
def gGetHtml(url):
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> " + str(e)
        return

"""get the file name based on the url"""
def gGetFileName(url):
    if url == None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""get the absolute address of a link found on the page at url"""
def gGetAbslLink(url, link):
    if url == None or link == None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match the regular expression against the input lines, return a list"""
def gGetRegList(linesList, regx):
    if linesList == None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download the file at url, with the file name given as a parameter"""
def gDownloadWithFilename(url, savePath, file):
    # parameter check is ignored for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR! ==>> %s" % (url, error)
    except Exception, e:
        print "Exception ==>> " + str(e)

"""download the file at url, with the file name taken automatically from the url"""
def gDownload(url, savePath):
    # parameter check is ignored for now
    fileName = gGetFileName(url)
    # fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download the jpg files of a web page, given the page's url"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r'src\s*="?(\S+)\.jpg'
    lists = gGetRegList(lines, regx)  # get the links that match the regular expression
    if lists == None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the origin site address based on the url"""
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""get the parent directory based on the url"""
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""get the absolute address of a link that points to an upper level, based on the url"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of levels up
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""get the related htm and html links on the page at url, return a list"""
def gGetHtmlLink(url):
    # parameter check is ignored for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r'href="?(\S+)\.htm'
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""given a url, capture the jpgs on it and the jpgs on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter check is ignored for now
    gDownloadHtmlJpg(url, savePath)
    # capture the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = 'http://site.douban.com/196738/room/2462453/'
    save = '/root/python/tmp/'
    # print 'download pic from [' + u + ']'
    # print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
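This script is also Python 2; in Python 3, urllib, httplib, and urlparse were reorganized into urllib.request, http.client, and urllib.parse. As a rough sketch only (not the author's original script), the core scraping loop can be written for Python 3 as follows; the regular expression, test URL, and save directory are assumptions carried over from the example above:

# Minimal Python 3 sketch of the same scraping flow (assumption: Python 3.x, standard library only).
import os
import re
import urllib.request
from urllib.parse import urljoin

def download_page_jpgs(page_url, save_path):
    """Fetch page_url, find src="...jpg" references, and save each image under save_path."""
    os.makedirs(save_path, exist_ok=True)
    html = urllib.request.urlopen(page_url).read().decode("utf-8", errors="ignore")
    # Same pattern idea as the Python 2 script: pull the target of src="...jpg"
    for link in set(re.findall(r'src\s*="?(\S+?\.jpg)', html, flags=re.IGNORECASE)):
        img_url = urljoin(page_url, link)        # resolve relative links against the page URL
        file_name = img_url.rsplit("/", 1)[-1]   # last path segment as the file name
        try:
            with urllib.request.urlopen(img_url) as resp, \
                 open(os.path.join(save_path, file_name), "wb") as f:
                f.write(resp.read())
            print("saved", file_name)
        except Exception as e:
            print("DOWNLOAD %s ERROR ==>> %s" % (img_url, e))

if __name__ == "__main__":
    # Hypothetical test values mirroring the original article's example.
    download_page_jpgs("http://site.douban.com/196738/room/2462453/", "./tmp")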

That concludes this short introduction to capturing all the images on a web page with Python and saving them locally. I hope you find it useful.

Articles you may be interested in:
  • Python captures web images and places them in a specified folder
  • Python web image capture example (Python crawler)
  • Use Python 3 to write a script that captures only the images on a web page
