This article shares the code for a Python crawler that downloads JS effect templates from the Lanrentuku ("lazy man") gallery, implemented with the third-party gevent library. Readers who need it can refer to it.
This is a simple Python script. It downloads JavaScript effect templates from the Lanrentuku library, and it uses the third-party gevent library, which must be installed before running the script.
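gevent is not part of the standard library, and neither is BeautifulSoup, which the script also imports. Assuming a standard pip setup, both can be installed with a command along these lines:

pip install gevent beautifulsoup4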
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Note: this script targets Python 2 (urllib.urlopen, xrange, print statement).
import urllib, os, sys
import gevent, re
from gevent import monkey
from bs4 import BeautifulSoup

# Patch the socket module so urllib's blocking I/O cooperates with gevent.
gevent.monkey.patch_socket()

'''
Description: Python crawler that downloads JS script templates from the
             Lanrentuku ("lazy man") gallery
Author: admin
Create-Date: 2015-05-25
Version: 1.0
'''

http_url = 'http://www.lanrentuku.com%s'
download_url = http_url[:-2] + '/js/d%szip'
reg = r'\d{1,}\.+'


def encode(text):
    return text.encode("utf8")


def createDirectory(curPath):
    # Save everything under a 'JS code template' folder in the working directory.
    myPath = os.path.join(getSubDirectory(), u'JS code template')
    if not os.path.exists(myPath):
        os.mkdir(myPath)
    return os.path.join(myPath, curPath)


def getSubDirectory():
    return os.getcwd()


def schedule(a, b, c):
    # urlretrieve progress hook: a = blocks transferred, b = block size, c = total size.
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    sys.stdout.write('%.1f%%\r' % per)
    sys.stdout.flush()


def getUrlList(url):
    url_list = {}
    html = urllib.urlopen(url)
    content = html.read()
    html.close()
    # Parse with BeautifulSoup.
    decodeHtml = BeautifulSoup(content)
    try:
        aTags = decodeHtml.find_all('div', {'class': 'list-pngjs'})[0].find_all('a')
    except IndexError, e:
        print e
        aTags = None
    # Collect link address and title.
    if aTags is not None:
        for a_tag in aTags:
            url_list[http_url % a_tag.get('href')] = a_tag.get_text()
    return url_list


def download(down_url):
    try:
        # down_url is a (link, title) pair; the digits in the link identify the zip file.
        m = re.search(reg, down_url[0])
        name = download_url % m.group(0)
        urllib.urlretrieve(name, createDirectory(down_url[1] + name[-4:]), schedule)
    except Exception, e:
        print e.message


def getPageUrl(xurl):
    # Build the URL for every list page.
    return [xurl % page for page in xrange(1, 49)]


if __name__ == '__main__':
    jobs = []
    pageUrl = getPageUrl('http://www.lanrentuku.com/js/p%s.html')
    # Crawl all links, one greenlet per download.
    for i in pageUrl:
        for k in getUrlList(i).items():
            jobs.append(gevent.spawn(download, k))
    gevent.joinall(jobs)
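Since the script above is written for Python 2, the following is a minimal Python 3 sketch of the same gevent download pattern. The URLs and file names are hypothetical placeholders, and it assumes gevent is installed:

# Minimal Python 3 sketch of the same concurrent-download pattern.
# The URLs below are hypothetical placeholders, not real download links.
from gevent import monkey
monkey.patch_all()  # patch stdlib sockets before other imports so they yield to gevent

import os
import gevent
from urllib.request import urlretrieve

# Hypothetical (url, title) pairs standing in for what getUrlList() would scrape.
urls = [
    ('http://example.com/js/d1.zip', 'template-1'),
    ('http://example.com/js/d2.zip', 'template-2'),
]

def download(item):
    url, title = item
    try:
        # Save each archive into the current working directory.
        urlretrieve(url, os.path.join(os.getcwd(), title + '.zip'))
    except Exception as e:
        print(e)

# One greenlet per download; joinall waits for all of them to finish.
jobs = [gevent.spawn(download, item) for item in urls]
gevent.joinall(jobs)

Because the blocking socket calls inside urlretrieve are monkey-patched, each greenlet yields to the others while waiting on the network, so the downloads overlap instead of running one after another.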
The above is the entire content of this article. I hope you find it useful.