Using Python to capture pictures

This article collects three pieces of Python code for multi-threaded batch downloading of images. The main idea is to split the work across threads: one set of threads retrieves the image links while another downloads the images. The crawlers also follow pagination instead of only grabbing the image links on the first page, so readers with similar requirements can use them as a starting point for their own multi-threaded image downloader. The code still has some flaws, but I am recording it here and sharing it with you.
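All three scripts below share the same basic pattern: one pool of threads gathers image links, and another downloads them. Here is a minimal sketch of that pattern in Python 3 (the scripts themselves are Python 2); the page URL and the .jpg regex are placeholders for illustration, not taken from any of the scripts:

import re
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool  # thread-based Pool

def get_pic_links(page_url):
    # Stage 1 worker: collect candidate .jpg links from one page.
    html = urllib.request.urlopen(page_url, timeout=60).read().decode("utf-8", "ignore")
    return re.findall(r'https?://[^\s"\']+\.jpg', html)

def download_pic(pic_url):
    # Stage 2 worker: download one image into the current directory.
    data = urllib.request.urlopen(pic_url, timeout=60).read()
    with open(pic_url.rsplit("/", 1)[-1], "wb") as f:
        f.write(data)

# "example.com" is a placeholder site, not one of the sites crawled below.
pages = ["http://example.com/?page=%d" % i for i in range(1, 3)]
pool = ThreadPool(8)
pic_lists = pool.map(get_pic_links, pages)   # stage 1: gather links in parallel
pics = [u for lst in pic_lists for u in lst]
pool.map(download_pic, pics)                 # stage 2: download in parallel
pool.close()
pool.join()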

Pic_downloader.py

#-*-Coding: UTF-8-*-"Created on Fri Aug 07 17:30:58 2015 @ author: Dreace" import urllib2import sysimport timeimport osimport randomfrom multiprocessing. dummy import Pool as ThreadPool type _ = sys. getfilesystemencoding () def rename (): return time. strftime ("% Y % m % d % H % M % S") def rename_2 (name): if len (name) = 2: name = '0' + name + '.jpg 'elif len (name) = 1: name = '00' + name + '.jpg' else: name = Name + '.jpg 'return namedef download_pic (I): global count global time_out if Filter (I): try: content = urllib2.urlopen (I, timeout = time_out) url_content = content. read () f = open (repr (random. randint (Random, 99999999999) + "_" + rename_2 (repr (count), "wb") f. write (url_content) f. close () count + = 1 failed t Exception, e: print I + "download timeout, skip! ". Decode ("UTF-8 "). encode (type _) def Filter (content): for line in Filter_list: line = line. strip ('\ n') if content. find (line) =-1: return Truedef get_pic (url_address): global pic_list try: str _ = urllib2.urlopen (url_address, timeout = time_out ). read () url_content = str _. split ("\" ") for I in url_content: if I. find (". jpg ")! =-1: pic_list.append (I) failed t Exception, e: print "image retrieval time-out, skip! ". Decode ("UTF-8 "). encode (type _) MAX = 2 count = 0time_out = 60thread_num = 30pic_list = [] page_list = [] Filter_list = ["imgsize.ph.126.net", "img.ph.126.net ", "img2.ph.126.net"] dir_name = "C: \ Photos \" + rename () OS. makedirs (dir_name) OS. chdir (dir_name) start_time = time. time () url_address =" http://sexy.faceks.com/?page= "For I in range (1, MAX + 1): page_list.append (url_address + repr (I) page_pool = ThreadPool (thread_num) page_pool.map (get_pic, page_list) print" get ". decode ("UTF-8 "). encode (type _), len (pic_list), "image, Download Now! ". Decode ("UTF-8 "). encode (type _) pool = ThreadPool (thread_num) pool. map (download_pic, pic_list) pool. close () pool. join () print count, "save images in ". decode ("UTF-8 "). encode (type _) + dir_nameprint "time consumed ". decode ("UTF-8 "). encode (type _), time. time ()-start_time, "s"
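One caveat about the script above: the global count is incremented by many pool workers at once. CPython's GIL makes this mostly work in practice, but the increment is not guaranteed to be atomic. A small sketch of a lock-protected counter the workers could share instead (the Counter class here is illustrative, not part of the original script):

import threading

class Counter(object):
    def __init__(self):
        self._lock = threading.Lock()
        self.value = 0

    def increment(self):
        with self._lock:   # serialize updates across worker threads
            self.value += 1
            return self.value

counter = Counter()
# inside download_pic(), instead of ``count += 1``:
# n = counter.increment()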

Next, let's look at another netizen's version.

#!/usr/bin/python
# coding: utf-8
#########################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################
import re, urllib2, HTMLParser, threading, Queue, time

# entry link of each gallery
htmlDoorList = []
# links of the HTML pages that contain the images
htmlUrlList = []
# queue of image URLs
imageUrlList = Queue.Queue(0)
# number of image URLs captured
imageGetCount = 0
# number of images downloaded
imageDownloadCount = 0
# next-page address of the current gallery, used to decide when to stop
nextHtmlUrl = ''
# local storage path
localSavePath = '/data/1920x1080/'
# to download another resolution, change replace_str (e.g. 1920x1200, 1280x800, ...)
replace_str = '1920x1080'
replaced_str = '960x600'

# parser for the inner (picture) pages
class ImageHtmlParser(HTMLParser.HTMLParser):
    def __init__(self):
        self.nextUrl = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        global imageUrlList, imageGetCount
        if (tag == 'img' and len(attrs) > 2):
            if (attrs[0] == ('id', 'bigImg')):
                url = attrs[1][1]
                url = url.replace(replaced_str, replace_str)
                imageUrlList.put(url)
                imageGetCount = imageGetCount + 1
                print url
        elif (tag == 'a' and len(attrs) == 4):
            if (attrs[0] == ('id', 'pageNext') and attrs[1] == ('class', 'next')):
                global nextHtmlUrl
                nextHtmlUrl = attrs[2][1]

# parser for the index page
class IndexHtmlParser(HTMLParser.HTMLParser):
    def __init__(self):
        self.urlList = []
        self.index = 0
        self.nextUrl = ''
        self.tagList = ['li', 'a']
        self.classList = ['photo-list-padding', 'pic']
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if (tag == self.tagList[self.index]):
            for attr in attrs:
                if (attr[1] == self.classList[self.index]):
                    if (self.index == 0):
                        # first layer found
                        self.index = 1
                    else:
                        # second layer found
                        self.index = 0
                        print attrs[1][1]
                        self.urlList.append(attrs[1][1])
                    break
        elif (tag == 'a'):
            for attr in attrs:
                if (attr[0] == 'id' and attr[1] == 'pageNext'):
                    self.nextUrl = attrs[1][1]
                    print 'nextUrl:', self.nextUrl
                    break

# index page parser
indexParser = IndexHtmlParser()
# inner page parser
imageParser = ImageHtmlParser()

# collect all entry links from the index page
print 'Scanning the index page...'
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while (indexUrl != ''):
    print 'Crawling page:', host + indexUrl
    request = urllib2.Request(host + indexUrl)
    try:
        m = urllib2.urlopen(request)
        con = m.read()
        indexParser.feed(con)
        if (indexUrl == indexParser.nextUrl):
            break
        else:
            indexUrl = indexParser.nextUrl
    except urllib2.URLError, e:
        print e.reason
print 'Index scan finished; all gallery links obtained:'
htmlDoorList = indexParser.urlList

# thread that collects the image URLs
class getImageUrl(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global nextHtmlUrl
        for door in htmlDoorList:
            print 'Fetching image addresses, entry address:', door
            nextHtmlUrl = ''
            while (door != ''):
                print 'Fetching images from page %s...' % (host + door)
                if (nextHtmlUrl != ''):
                    request = urllib2.Request(host + nextHtmlUrl)
                else:
                    request = urllib2.Request(host + door)
                try:
                    m = urllib2.urlopen(request)
                    con = m.read()
                    imageParser.feed(con)
                    print 'Next page address:', nextHtmlUrl
                    if (door == nextHtmlUrl):
                        break
                except urllib2.URLError, e:
                    print e.reason
        print 'All image addresses obtained:', imageUrlList

# thread that downloads the images
class getImage(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global imageUrlList, imageDownloadCount
        print 'Starting the download...'
        while (True):
            print 'Image URLs captured so far:', imageGetCount
            print 'Images downloaded so far:', imageDownloadCount
            image = imageUrlList.get()
            print 'Downloading file:', image
            try:
                cont = urllib2.urlopen(image).read()
                patter = '[0-9]*\.jpg'
                match = re.search(patter, image)
                if match:
                    print 'Saving file:', match.group()
                    filename = localSavePath + match.group()
                    f = open(filename, 'wb')
                    f.write(cont)
                    f.close()
                    imageDownloadCount = imageDownloadCount + 1
                else:
                    print 'no match'
                if (imageUrlList.empty()):
                    break
            except urllib2.URLError, e:
                print e.reason
        print 'All files downloaded...'

get = getImageUrl()
get.start()
print 'Image-link thread started:'
time.sleep(2)
download = getImage()
download.start()
print 'Image download thread started:'
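A caveat about the downloader thread above: imageUrlList.get() blocks forever if the producer thread finishes while the queue is empty, and the empty() check races with the producer. A common fix is to have the producer push a sentinel value when it is done; a minimal Python 3 sketch (the names here are illustrative, not from the original script):

import queue
import threading

SENTINEL = None
image_queue = queue.Queue()

def producer(urls):
    for u in urls:
        image_queue.put(u)
    image_queue.put(SENTINEL)    # signal "no more work"

def consumer():
    while True:
        item = image_queue.get()
        if item is SENTINEL:     # producer is done; exit cleanly
            break
        print("would download:", item)

t1 = threading.Thread(target=producer, args=(["a.jpg", "b.jpg"],))
t2 = threading.Thread(target=consumer)
t1.start(); t2.start()
t1.join(); t2.join()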


Batch-capturing all images on a specified webpage

# -*- coding: utf-8 -*-
import os, urllib, urllib2, re

url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\\"

def getHtml(url):
    webfile = urllib.urlopen(url)
    outhtml = webfile.read()
    print outhtml
    return outhtml

def getImageList(html):
    # build one alternation covering the common image extensions,
    # over both http and https
    restr = ur'('
    restr += ur'http:\/\/[^\s,"]*\.jpg'
    restr += ur'|http:\/\/[^\s,"]*\.jpeg'
    restr += ur'|http:\/\/[^\s,"]*\.png'
    restr += ur'|http:\/\/[^\s,"]*\.gif'
    restr += ur'|http:\/\/[^\s,"]*\.bmp'
    restr += ur'|https:\/\/[^\s,"]*\.jpg'
    restr += ur'|https:\/\/[^\s,"]*\.jpeg'
    restr += ur'|https:\/\/[^\s,"]*\.png'
    restr += ur'|https:\/\/[^\s,"]*\.gif'
    restr += ur'|https:\/\/[^\s,"]*\.bmp'
    restr += ur')'
    htmlurl = re.compile(restr)
    imgList = re.findall(htmlurl, html)
    print imgList
    return imgList

def download(imgList, page):
    x = 1
    for imgurl in imgList:
        # name files pic_<page>_<index> and keep the original extension
        filepathname = str(outpath + 'pic_%09d_%010d' % (page, x) +
                           str(os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1]).lower())
        print '[Debug] Download file:' + imgurl + '>' + filepathname
        urllib.urlretrieve(imgurl, filepathname)
        x += 1

def downImageNum(pagenum):
    page = 1
    pageNumber = pagenum
    while (page <= pageNumber):
        html = getHtml(url)               # fetch the HTML the url points to
        imageList = getImageList(html)    # extract every image address from it
        download(imageList, page)         # download all the images found
        page = page + 1

if __name__ == '__main__':
    downImageNum(1)
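As an aside, the long chain of restr += alternations above can be collapsed into a single pattern. A short Python 3 sketch that is equivalent in intent (the sample HTML is made up for illustration):

import re

# one alternation over the extensions instead of ten separate branches
IMG_RE = re.compile(r'https?://[^\s,"]*\.(?:jpe?g|png|gif|bmp)')

html = '<img src="http://example.com/a.jpg"> <img src="https://example.com/b.png">'
print(IMG_RE.findall(html))
# ['http://example.com/a.jpg', 'https://example.com/b.png']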

The above are three Python implementations of multi-threaded batch image crawling. I hope they are helpful to anyone learning to write Python crawlers.
