I have recently reached the web-crawler part of learning Python. After seeing other people share code for crawling all kinds of websites, I wanted to write one myself. Today I took some time to analyze the structure and HTML of the Sister Figure site, and used urllib2 and BeautifulSoup to write a script that automatically downloads its pictures.
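The core idea is simple: urllib2 fetches a page's HTML, and BeautifulSoup picks out the elements of interest. Below is a rough, minimal sketch of that fetch-and-parse pattern on its own; the 'page-numbers' CSS class is the one the full script further down uses to find the site's pagination links.

# -*- coding: utf8 -*-
# Minimal sketch (Python 2): fetch a page with urllib2 and parse it with BeautifulSoup.
import urllib2
from bs4 import BeautifulSoup

html = urllib2.urlopen('http://www.mzitu.com/').read()
soup = BeautifulSoup(html)
# the pagination links on the index page carry the CSS class 'page-numbers'
page_links = soup.find_all('a', class_='page-numbers')
print [a.text for a in page_links]

The full script repeats this same pattern for the theme list, the per-theme picture count, and the individual picture pages.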
Sister Figure website: http://www.mzitu.com
The results are as follows:
[Screenshot 1.png: http://s3.51cto.com/wyfs02/M00/6D/BE/wKiom1VqxezQ2gI5AAIsE59qMbE377.jpg]
[Screenshot 2.png: http://s3.51cto.com/wyfs02/M01/6D/BA/wKioL1Vqx4PxV3a7AAE-zNgCpXY813.jpg]
The source code is as follows; advice from more experienced readers is welcome:
# -*- coding: utf8 -*-
# python:   2.7.8
# Platform: Windows
# Author:   wucl
# Version:  1.0
# Program:  automatically download pictures from the Sister Figure site and save them locally
# History:  2015.5.31

import urllib2, os, os.path, urllib
from bs4 import BeautifulSoup


def get_pages(url):
    """Get the number of index pages on the Sister Figure site."""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    nums = soup.find_all('a', class_='page-numbers')
    pages = int(nums[-2].text)
    return pages


def get_menu(url):
    """Get the name and address of every theme linked from an index page, returned as a list."""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    menu = []
    menu_list = soup.find_all('a', target='_blank')
    for i in menu_list:
        result = i.find_all('img', class_='lazy')
        if result:
            name = result[0]['alt']
            address = i['href']
            menu.append([name, address])
    return menu


def get_links(url):
    """Get the total number of pictures in a single theme."""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    all_ = soup.find_all('a')
    nums = []
    for i in all_:
        span = i.find_all('span')
        if span:
            nums.append(span[0].text)
    return nums[-2]


def get_image(url, filename):
    """Extract the picture from a single picture page and save it as filename."""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    image = soup.find_all('p')[0].find_all('img')[0]['src']
    urllib.urlretrieve(image, filename)


def main(page):
    """Download every theme on one index page."""
    print u'Downloading page %d' % page
    page_url = url + '/page/' + str(page)
    menu = get_menu(page_url)
    print u'Page %d has %d themes in total' % (page, len(menu))
    for i in menu:
        dir_name = os.path.join('meizitu', i[0])
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        pic_nums = int(get_links(i[1]))
        print u'Theme %s has %d pictures in total' % (i[0], pic_nums)
        for pic in range(1, pic_nums + 1):
            basename = str(pic) + '.jpg'
            filename = os.path.join(dir_name, basename)
            pic_url = i[1] + '/' + str(pic)
            if not os.path.exists(filename):
                print u'......downloading %s' % basename
                get_image(pic_url, filename)
            else:
                print filename + u' already exists, skipped'


if __name__ == '__main__':
    url = 'http://www.mzitu.com/'
    pages = get_pages(url)
    print u'The Sister Figure site has %d pages in total' % pages
    if not os.path.exists('meizitu'):
        os.mkdir('meizitu')
    page_start = input(u'Enter the start page: \n')
    page_end = input(u'Enter the end page: \n')
    if page_end > page_start:
        # download from the start page through the end page, inclusive
        for page in range(page_start, page_end + 1):
            main(page)
    elif page_end == page_start:
        main(page_end)
    else:
        print u'Input error: the start page must be less than or equal to the end page\n'
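For reference, assuming the script is saved under a name of your choosing (meizitu.py here is just an example) and run with Python 2.7, it prompts for a start page and an end page and then saves each theme into its own subdirectory:

python meizitu.py
# after entering the start and end page at the prompts, the downloads end up in a layout like:
# meizitu\
#     <theme name>\
#         1.jpg
#         2.jpg
#         ...

Pictures that already exist on disk are skipped, so an interrupted run can simply be restarted with the same page range.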
This article is from the "Wine similarities Lake" blog. Please keep this source when reposting: http://wucl202000.blog.51cto.com/4687508/1656871
Using Python to download pictures from the Sister Figure site