Using Python to capture pictures

This article collects three pieces of Python code for multi-threaded batch downloading of images. The main idea is to split the work across threads: one set of threads retrieves the image links while another downloads the images. The crawlers also follow pagination instead of only grabbing the image links on the first page, so readers with similar requirements can use them as a starting point for their own multi-threaded image downloader. The code still has some flaws, but I am recording it here and sharing it with you.
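All three scripts below share the same basic pattern: one pool of threads gathers image links, and another downloads them. Here is a minimal sketch of that pattern in Python 3 (the scripts themselves are Python 2); the page URL and the .jpg regex are placeholders for illustration, not taken from any of the scripts:

import re
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool  # thread-based Pool

def get_pic_links(page_url):
    # Stage 1 worker: collect candidate .jpg links from one page.
    html = urllib.request.urlopen(page_url, timeout=60).read().decode("utf-8", "ignore")
    return re.findall(r'https?://[^\s"\']+\.jpg', html)

def download_pic(pic_url):
    # Stage 2 worker: download one image into the current directory.
    data = urllib.request.urlopen(pic_url, timeout=60).read()
    with open(pic_url.rsplit("/", 1)[-1], "wb") as f:
        f.write(data)

# "example.com" is a placeholder site, not one of the sites crawled below.
pages = ["http://example.com/?page=%d" % i for i in range(1, 3)]
pool = ThreadPool(8)
pic_lists = pool.map(get_pic_links, pages)   # stage 1: gather links in parallel
pics = [u for lst in pic_lists for u in lst]
pool.map(download_pic, pics)                 # stage 2: download in parallel
pool.close()
pool.join()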

Pic_downloader.py

#-*-Coding: UTF-8-*-"Created on Fri Aug 07 17:30:58 2015 @ author: Dreace" import urllib2import sysimport timeimport osimport randomfrom multiprocessing. dummy import Pool as ThreadPool type _ = sys. getfilesystemencoding () def rename (): return time. strftime ("% Y % m % d % H % M % S") def rename_2 (name): if len (name) = 2: name = '0' + name + '.jpg 'elif len (name) = 1: name = '00' + name + '.jpg' else: name = Name + '.jpg 'return namedef download_pic (I): global count global time_out if Filter (I): try: content = urllib2.urlopen (I, timeout = time_out) url_content = content. read () f = open (repr (random. randint (Random, 99999999999) + "_" + rename_2 (repr (count), "wb") f. write (url_content) f. close () count + = 1 failed t Exception, e: print I + "download timeout, skip! ". Decode ("UTF-8 "). encode (type _) def Filter (content): for line in Filter_list: line = line. strip ('\ n') if content. find (line) =-1: return Truedef get_pic (url_address): global pic_list try: str _ = urllib2.urlopen (url_address, timeout = time_out ). read () url_content = str _. split ("\" ") for I in url_content: if I. find (". jpg ")! =-1: pic_list.append (I) failed t Exception, e: print "image retrieval time-out, skip! ". Decode ("UTF-8 "). encode (type _) MAX = 2 count = 0time_out = 60thread_num = 30pic_list = [] page_list = [] Filter_list = ["imgsize.ph.126.net", "img.ph.126.net ", "img2.ph.126.net"] dir_name = "C: \ Photos \" + rename () OS. makedirs (dir_name) OS. chdir (dir_name) start_time = time. time () url_address =" http://sexy.faceks.com/?page= "For I in range (1, MAX + 1): page_list.append (url_address + repr (I) page_pool = ThreadPool (thread_num) page_pool.map (get_pic, page_list) print" get ". decode ("UTF-8 "). encode (type _), len (pic_list), "image, Download Now! ". Decode ("UTF-8 "). encode (type _) pool = ThreadPool (thread_num) pool. map (download_pic, pic_list) pool. close () pool. join () print count, "save images in ". decode ("UTF-8 "). encode (type _) + dir_nameprint "time consumed ". decode ("UTF-8 "). encode (type _), time. time ()-start_time, "s"
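One caveat about the script above: the global count is incremented by many pool workers at once. CPython's GIL makes this mostly work in practice, but the increment is not guaranteed to be atomic. A small sketch of a lock-protected counter the workers could share instead (the Counter class here is illustrative, not part of the original script):

import threading

class Counter(object):
    def __init__(self):
        self._lock = threading.Lock()
        self.value = 0

    def increment(self):
        with self._lock:   # serialize updates across worker threads
            self.value += 1
            return self.value

counter = Counter()
# inside download_pic(), instead of ``count += 1``:
# n = counter.increment()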

Next, let's look at another netizen's version.

#!/usr/bin/python
# coding: utf-8
#########################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################
import re, urllib2, HTMLParser, threading, Queue, time

# entry link of each gallery
htmlDoorList = []
# links of the HTML pages that contain the images
htmlUrlList = []
# queue of image URLs
imageUrlList = Queue.Queue(0)
# number of image URLs captured
imageGetCount = 0
# number of images downloaded
imageDownloadCount = 0
# next-page address of the current gallery, used to decide when to stop
nextHtmlUrl = ''
# local storage path
localSavePath = '/data/1920x1080/'
# to download another resolution, change replace_str (e.g. 1920x1200, 1280x800, ...)
replace_str = '1920x1080'
replaced_str = '960x600'

# parser for the inner (picture) pages
class ImageHtmlParser(HTMLParser.HTMLParser):
    def __init__(self):
        self.nextUrl = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        global imageUrlList, imageGetCount
        if (tag == 'img' and len(attrs) > 2):
            if (attrs[0] == ('id', 'bigImg')):
                url = attrs[1][1]
                url = url.replace(replaced_str, replace_str)
                imageUrlList.put(url)
                imageGetCount = imageGetCount + 1
                print url
        elif (tag == 'a' and len(attrs) == 4):
            if (attrs[0] == ('id', 'pageNext') and attrs[1] == ('class', 'next')):
                global nextHtmlUrl
                nextHtmlUrl = attrs[2][1]

# parser for the index page
class IndexHtmlParser(HTMLParser.HTMLParser):
    def __init__(self):
        self.urlList = []
        self.index = 0
        self.nextUrl = ''
        self.tagList = ['li', 'a']
        self.classList = ['photo-list-padding', 'pic']
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if (tag == self.tagList[self.index]):
            for attr in attrs:
                if (attr[1] == self.classList[self.index]):
                    if (self.index == 0):
                        # first layer found
                        self.index = 1
                    else:
                        # second layer found
                        self.index = 0
                        print attrs[1][1]
                        self.urlList.append(attrs[1][1])
                    break
        elif (tag == 'a'):
            for attr in attrs:
                if (attr[0] == 'id' and attr[1] == 'pageNext'):
                    self.nextUrl = attrs[1][1]
                    print 'nextUrl:', self.nextUrl
                    break

# index page parser
indexParser = IndexHtmlParser()
# inner page parser
imageParser = ImageHtmlParser()

# collect all entry links from the index page
print 'Scanning the index page...'
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while (indexUrl != ''):
    print 'Crawling page:', host + indexUrl
    request = urllib2.Request(host + indexUrl)
    try:
        m = urllib2.urlopen(request)
        con = m.read()
        indexParser.feed(con)
        if (indexUrl == indexParser.nextUrl):
            break
        else:
            indexUrl = indexParser.nextUrl
    except urllib2.URLError, e:
        print e.reason
print 'Index scan finished; all gallery links obtained:'
htmlDoorList = indexParser.urlList

# thread that collects the image URLs
class getImageUrl(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global nextHtmlUrl
        for door in htmlDoorList:
            print 'Fetching image addresses, entry address:', door
            nextHtmlUrl = ''
            while (door != ''):
                print 'Fetching images from page %s...' % (host + door)
                if (nextHtmlUrl != ''):
                    request = urllib2.Request(host + nextHtmlUrl)
                else:
                    request = urllib2.Request(host + door)
                try:
                    m = urllib2.urlopen(request)
                    con = m.read()
                    imageParser.feed(con)
                    print 'Next page address:', nextHtmlUrl
                    if (door == nextHtmlUrl):
                        break
                except urllib2.URLError, e:
                    print e.reason
        print 'All image addresses obtained:', imageUrlList

# thread that downloads the images
class getImage(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global imageUrlList, imageDownloadCount
        print 'Starting the download...'
        while (True):
            print 'Image URLs captured so far:', imageGetCount
            print 'Images downloaded so far:', imageDownloadCount
            image = imageUrlList.get()
            print 'Downloading file:', image
            try:
                cont = urllib2.urlopen(image).read()
                patter = '[0-9]*\.jpg'
                match = re.search(patter, image)
                if match:
                    print 'Saving file:', match.group()
                    filename = localSavePath + match.group()
                    f = open(filename, 'wb')
                    f.write(cont)
                    f.close()
                    imageDownloadCount = imageDownloadCount + 1
                else:
                    print 'no match'
                if (imageUrlList.empty()):
                    break
            except urllib2.URLError, e:
                print e.reason
        print 'All files downloaded...'

get = getImageUrl()
get.start()
print 'Image-link thread started:'
time.sleep(2)
download = getImage()
download.start()
print 'Image download thread started:'
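A caveat about the downloader thread above: imageUrlList.get() blocks forever if the producer thread finishes while the queue is empty, and the empty() check races with the producer. A common fix is to have the producer push a sentinel value when it is done; a minimal Python 3 sketch (the names here are illustrative, not from the original script):

import queue
import threading

SENTINEL = None
image_queue = queue.Queue()

def producer(urls):
    for u in urls:
        image_queue.put(u)
    image_queue.put(SENTINEL)    # signal "no more work"

def consumer():
    while True:
        item = image_queue.get()
        if item is SENTINEL:     # producer is done; exit cleanly
            break
        print("would download:", item)

t1 = threading.Thread(target=producer, args=(["a.jpg", "b.jpg"],))
t2 = threading.Thread(target=consumer)
t1.start(); t2.start()
t1.join(); t2.join()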


Batch-capturing all images on a specified webpage

# -*- coding: utf-8 -*-
import os, urllib, urllib2, re

url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\\"

def getHtml(url):
    webfile = urllib.urlopen(url)
    outhtml = webfile.read()
    print outhtml
    return outhtml

def getImageList(html):
    # build one alternation covering the common image extensions,
    # over both http and https
    restr = ur'('
    restr += ur'http:\/\/[^\s,"]*\.jpg'
    restr += ur'|http:\/\/[^\s,"]*\.jpeg'
    restr += ur'|http:\/\/[^\s,"]*\.png'
    restr += ur'|http:\/\/[^\s,"]*\.gif'
    restr += ur'|http:\/\/[^\s,"]*\.bmp'
    restr += ur'|https:\/\/[^\s,"]*\.jpg'
    restr += ur'|https:\/\/[^\s,"]*\.jpeg'
    restr += ur'|https:\/\/[^\s,"]*\.png'
    restr += ur'|https:\/\/[^\s,"]*\.gif'
    restr += ur'|https:\/\/[^\s,"]*\.bmp'
    restr += ur')'
    htmlurl = re.compile(restr)
    imgList = re.findall(htmlurl, html)
    print imgList
    return imgList

def download(imgList, page):
    x = 1
    for imgurl in imgList:
        # name files pic_<page>_<index> and keep the original extension
        filepathname = str(outpath + 'pic_%09d_%010d' % (page, x) +
                           str(os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1]).lower())
        print '[Debug] Download file:' + imgurl + '>' + filepathname
        urllib.urlretrieve(imgurl, filepathname)
        x += 1

def downImageNum(pagenum):
    page = 1
    pageNumber = pagenum
    while (page <= pageNumber):
        html = getHtml(url)               # fetch the HTML the url points to
        imageList = getImageList(html)    # extract every image address from it
        download(imageList, page)         # download all the images found
        page = page + 1

if __name__ == '__main__':
    downImageNum(1)
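As an aside, the long chain of restr += alternations above can be collapsed into a single pattern. A short Python 3 sketch that is equivalent in intent (the sample HTML is made up for illustration):

import re

# one alternation over the extensions instead of ten separate branches
IMG_RE = re.compile(r'https?://[^\s,"]*\.(?:jpe?g|png|gif|bmp)')

html = '<img src="http://example.com/a.jpg"> <img src="https://example.com/b.png">'
print(IMG_RE.findall(html))
# ['http://example.com/a.jpg', 'https://example.com/b.png']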

The above are three Python implementations of multi-threaded batch image crawling. I hope they are helpful to anyone learning to write Python crawlers.
