#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Crawl the JS script templates of the "lazy Image Library" site.

Every list page is fetched, the template links on it are collected, and the
matching zip archive of each template is downloaded concurrently (one gevent
greenlet per archive) into a sub-directory of the current working directory.

Author: admin
Create-Date: 2015-05-25
Version: 1.0
"""
import contextlib
import os
import re
import sys
import urllib

import gevent
from gevent import monkey
from bs4 import BeautifulSoup

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    urlopen = urllib.urlopen
    urlretrieve = urllib.urlretrieve

# Make socket operations cooperative so the download greenlets can overlap.
gevent.monkey.patch_socket()

# NOTE(review): the host name below is reproduced as it appears in the
# source this file was recovered from and looks garbled -- confirm the real
# site address before running.
_BASE_URL = 'http://www.lanw.uku.com'

# Template turning a site-relative href into an absolute URL.
HTTP_URL = _BASE_URL + '%s'
# Template for an archive URL; the placeholder receives "<digits>." so the
# result ends in "<digits>.zip".
DOWNLOAD_URL = HTTP_URL[:-2] + '/js/d%szip'
# Matches the numeric id together with the dot(s) that follow it in a URL.
reg = r'\d{1,}\.+'

# Name of the directory (under the working directory) receiving the archives.
_TARGET_DIR_NAME = u'js code template'


def encode(text):
    """Return *text* encoded as UTF-8."""
    return text.encode("utf8")


def createDirectory(curPath):
    """Ensure the download directory exists and return a path inside it.

    :param curPath: file name to place inside the download directory.
    :return: ``<cwd>/<download dir>/<curPath>``.
    :raises OSError: if the directory cannot be created.
    """
    myPath = os.path.join(getSubDirectory(), _TARGET_DIR_NAME)
    try:
        os.mkdir(myPath)
    except OSError:
        # An already existing directory is fine; anything else is a real
        # error (permissions, a regular file in the way, ...).
        if not os.path.isdir(myPath):
            raise
    return os.path.join(myPath, curPath)


def getSubDirectory():
    """Return the current working directory."""
    return os.getcwd()


def schedule(a, b, c):
    """Progress hook for ``urlretrieve``: print the percentage downloaded.

    :param a: number of blocks transferred so far.
    :param b: size of one block in bytes.
    :param c: total size in bytes; ``urlretrieve`` passes a non-positive
        value when the size is unknown, in which case nothing is printed.
    """
    if c <= 0:
        # Unknown or empty size: a percentage cannot be computed.
        return
    per = min(100.0, 100.0 * a * b / c)
    sys.stdout.write('%.1f%%\r' % per)
    sys.stdout.flush()


def geturllist(url):
    """Collect the template links found on one list page.

    :param url: address of the list page.
    :return: dict mapping the absolute URL of each template page to the
        text of its link; empty when the expected container is missing.
    """
    # closing() guarantees the response is released even if read() fails.
    with contextlib.closing(urlopen(url)) as response:
        content = response.read()

    # Use BeautifulSoup for parsing; naming the parser keeps the result
    # independent of which optional parsers happen to be installed.
    decodeHtml = BeautifulSoup(content, 'html.parser')

    # NOTE(review): class name normalised to lower case from the recovered
    # source -- confirm against the live page markup.
    container = decodeHtml.find('div', {'class': 'list-pngjs'})
    if container is None:
        sys.stderr.write('no template list found on %s\n' % url)
        return {}

    # Obtain the link address and title; anchors without an href are skipped.
    return {
        HTTP_URL % a_tag['href']: a_tag.get_text()
        for a_tag in container.find_all('a', href=True)
    }


def download(down_url):
    """Download the archive belonging to one template page.

    Failures are reported on stderr and otherwise ignored so that a single
    broken item does not abort the remaining downloads.

    :param down_url: ``(page_url, title)`` pair as yielded by
        ``geturllist(...).items()``.
    """
    page_url = down_url[0]
    title = down_url[1]

    m = re.search(reg, page_url)
    if m is None:
        sys.stderr.write('no template id in %s, skipped\n' % page_url)
        return

    name = DOWNLOAD_URL % m.group(0)
    # The title comes from the remote page: neutralise path separators so it
    # cannot point outside the download directory.
    filename = re.sub(r'[\\/]', '_', title) + name[-4:]
    try:
        urlretrieve(name, createDirectory(filename), schedule)
    except Exception as e:
        # Deliberate best-effort boundary of one greenlet.
        sys.stderr.write('download of %s failed: %s\n' % (name, e))


def getpageurl(xUrl, first=1, stop=49):
    """Return the list-page URLs for pages ``first`` .. ``stop - 1``.

    :param xUrl: URL template containing one ``%s`` placeholder for the
        page number.
    :param first: first page number (inclusive).
    :param stop: page number to stop before (exclusive).
    """
    return [xUrl % page for page in range(first, stop)]


def main():
    """Crawl every list page and download all linked archives."""
    jobs = []
    pageurl = getpageurl(_BASE_URL + '/js/p%s.html')
    # Crawling all links
    for page in pageurl:
        for item in geturllist(page).items():
            jobs.append(gevent.spawn(download, item))
    gevent.joinall(jobs)


if __name__ == '__main__':
    main()