0
1.
[Code][Python] code
# coding=utf-8
"""Download the images linked from paginated ivsky.com gallery index pages.

The script rewrites the ``index_<n>`` marker in a start URL to enumerate the
index pages, extracts the image URLs from each page with XPath, and stores
every image as ``<digits-of-page-url>-<i>.jpg`` in an output folder.
"""
import os
import re
import sys
import time

if sys.version_info[0] == 2:
    # Python 2 only: keep the original default-encoding override so implicit
    # str/unicode concatenation behaves as before. Python 3 has no equivalent.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("UTF-8")


class spider(object):
    """Crawler that walks numbered index pages and saves every image found."""

    # Seconds to wait on a single HTTP request before giving up.
    TIMEOUT = 10

    def __init__(self):
        print(u"Starting to crawl content...")

    def getsource(self, url):
        """Fetch ``url`` and return the response body as text."""
        # Third-party; imported lazily so the URL helpers stay importable
        # (and testable) on machines without the network dependencies.
        import requests

        return requests.get(url, timeout=self.TIMEOUT).text

    def changepage(self, url, total_page):
        """Return the index-page URLs from the page in ``url`` to ``total_page``.

        Args:
            url: Start URL containing an ``index_<n>`` page marker.
            total_page: Last page number to include (inclusive).

        Returns:
            A list of URLs, one per page; empty if the start page is already
            past ``total_page``.

        Raises:
            ValueError: If ``url`` has no ``index_<n>`` marker.
        """
        match = re.search(r"index_(\d+)", url)  # adjust for other sites
        if match is None:
            raise ValueError("no 'index_<n>' page marker in url: %s" % url)
        now_page = int(match.group(1))
        # No flags argument here: the fourth positional parameter of re.sub
        # is ``count``, not ``flags``.
        return [
            re.sub(r"index_\d+", "index_%s" % page, url)
            for page in range(now_page, total_page + 1)
        ]

    def getpic(self, source):
        """Return the image ``src`` values found in the HTML text ``source``."""
        from lxml import etree  # third-party; see note in getsource

        selector = etree.HTML(source)
        # adjust the XPath for other page layouts
        return selector.xpath('//ul[@class="ali"]/li/p/a/img/@src')

    def savepic(self, pic_url, link="", out_dir="pic"):
        """Download every URL in ``pic_url`` into ``out_dir``.

        Args:
            pic_url: Iterable of image URLs.
            link: URL of the page the images came from; its digits become the
                file-name prefix. Previously read from a module-level global.
            out_dir: Destination folder, created if it does not exist.
        """
        import requests  # third-party; see note in getsource

        prefix = "".join(re.findall(r"(\d+)", link))  # adjust if needed
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        for index, each in enumerate(pic_url):
            print("Now downloading: " + each)
            pic = requests.get(each, timeout=self.TIMEOUT)
            path = os.path.join(out_dir, "%s-%d.jpg" % (prefix, index))
            with open(path, "wb") as fp:
                fp.write(pic.content)

    def ppic(self, link):
        """Fetch the page at ``link``, extract its images and save them."""
        print(u"Processing page: " + link)
        html = self.getsource(link)
        pic_url = self.getpic(html)
        self.savepic(pic_url, link)


def main():
    """Crawl index pages 1-3 of the sample gallery and report elapsed time."""
    start = time.time()
    url = "http://www.ivsky.com/tupian/ziranfengguang/index_1.html"  # modifiable
    picspider = spider()
    for link in picspider.changepage(url, 3):
        picspider.ppic(link)
    print(u"Time consumed: " + str(time.time() - start))


if __name__ == "__main__":
    main()
2.
[File] Picspider.py ~ 2 KB
# coding=utf-8
"""Download the images linked from paginated ivsky.com gallery index pages.

The script rewrites the ``index_<n>`` marker in a start URL to enumerate the
index pages, extracts the image URLs from each page with XPath, and stores
every image as ``<digits-of-page-url>-<i>.jpg`` in an output folder.
"""
import os
import re
import sys
import time

if sys.version_info[0] == 2:
    # Python 2 only: keep the original default-encoding override so implicit
    # str/unicode concatenation behaves as before. Python 3 has no equivalent.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("UTF-8")


class spider(object):
    """Crawler that walks numbered index pages and saves every image found."""

    # Seconds to wait on a single HTTP request before giving up.
    TIMEOUT = 10

    def __init__(self):
        print(u"Starting to crawl content...")

    def getsource(self, url):
        """Fetch ``url`` and return the response body as text."""
        # Third-party; imported lazily so the URL helpers stay importable
        # (and testable) on machines without the network dependencies.
        import requests

        return requests.get(url, timeout=self.TIMEOUT).text

    def changepage(self, url, total_page):
        """Return the index-page URLs from the page in ``url`` to ``total_page``.

        Args:
            url: Start URL containing an ``index_<n>`` page marker.
            total_page: Last page number to include (inclusive).

        Returns:
            A list of URLs, one per page; empty if the start page is already
            past ``total_page``.

        Raises:
            ValueError: If ``url`` has no ``index_<n>`` marker.
        """
        match = re.search(r"index_(\d+)", url)  # adjust for other sites
        if match is None:
            raise ValueError("no 'index_<n>' page marker in url: %s" % url)
        now_page = int(match.group(1))
        # No flags argument here: the fourth positional parameter of re.sub
        # is ``count``, not ``flags``.
        return [
            re.sub(r"index_\d+", "index_%s" % page, url)
            for page in range(now_page, total_page + 1)
        ]

    def getpic(self, source):
        """Return the image ``src`` values found in the HTML text ``source``."""
        from lxml import etree  # third-party; see note in getsource

        selector = etree.HTML(source)
        # adjust the XPath for other page layouts
        return selector.xpath('//ul[@class="ali"]/li/p/a/img/@src')

    def savepic(self, pic_url, link="", out_dir="pic"):
        """Download every URL in ``pic_url`` into ``out_dir``.

        Args:
            pic_url: Iterable of image URLs.
            link: URL of the page the images came from; its digits become the
                file-name prefix. Previously read from a module-level global.
            out_dir: Destination folder, created if it does not exist.
        """
        import requests  # third-party; see note in getsource

        prefix = "".join(re.findall(r"(\d+)", link))  # adjust if needed
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        for index, each in enumerate(pic_url):
            print("Now downloading: " + each)
            pic = requests.get(each, timeout=self.TIMEOUT)
            path = os.path.join(out_dir, "%s-%d.jpg" % (prefix, index))
            with open(path, "wb") as fp:
                fp.write(pic.content)

    def ppic(self, link):
        """Fetch the page at ``link``, extract its images and save them."""
        print(u"Processing page: " + link)
        html = self.getsource(link)
        pic_url = self.getpic(html)
        self.savepic(pic_url, link)


def main():
    """Crawl index pages 1-3 of the sample gallery and report elapsed time."""
    start = time.time()
    url = "http://www.ivsky.com/tupian/ziranfengguang/index_1.html"  # modifiable
    picspider = spider()
    for link in picspider.changepage(url, 3):
        picspider.ppic(link)
    print(u"Time consumed: " + str(time.time() - start))


if __name__ == "__main__":
    main()
3.
[Image] 1.png