原文地址:http://www.darkbull.net/python/bd/%E5%86%99%E4%B8%AA%E5%9B%BE%E7%89%87%E8%9C%98%E8%9B%9B%E7%8E%A9%E7%8E%A9/
寫了個圖片蜘蛛玩玩,抓了幾個網頁試試,感覺還不錯。核心的代碼可能20行也不到,簡潔明瞭,嘻嘻。廢話少說,翠花,上代碼~~
# coding=utf-8
"""Tiny image spider: download every <img> referenced by one web page.

Usage:
    python <script> <target_dir> <page_url>

Example:
    python <script> D:\\Imgs http://www.163.com
"""
import contextlib
import os
import posixpath
import re
import sys
import urllib

try:  # Python 3
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urlparse import urljoin, urlparse
    from urllib import urlopen, urlretrieve

# Group 1 is "scheme://host" of an http(s) URL (everything before the first
# slash or backslash that follows the host).
URL_REG = re.compile(r'(https?://[^/\\]+)', re.I)

# Matches the quoted src attribute of an <img> tag.
# Group 1 is the quote character, group 2 the raw attribute value.
IMG_REG = re.compile(r"""<img[^>]*?src=(['"])(.*?)\1""", re.I | re.S)


def _fetch_html(url):
    """Return the body of *url* as text, closing the connection afterwards."""
    with contextlib.closing(urlopen(url)) as response:
        raw = response.read()
    if not isinstance(raw, str):
        # Python 3 returns bytes, but IMG_REG is a text pattern. Only ASCII
        # markup matters here, so undecodable bytes are simply replaced.
        raw = raw.decode('utf-8', 'replace')
    return raw


def _image_urls(html, page_url):
    """Return the sorted, de-duplicated absolute http(s) image URLs in *html*."""
    found = set()
    for _quote, src in IMG_REG.findall(html):
        # urljoin handles absolute, root-relative and page-relative paths.
        # The URL's case is preserved because paths may be case-sensitive.
        absolute = urljoin(page_url, src.strip())
        if urlparse(absolute).scheme in ('http', 'https'):
            found.add(absolute)
    return sorted(found)


def _local_name(img_url, idx, taken):
    """Pick a safe file name for *img_url* that is not already in *taken*.

    *taken* is a set of lower-cased names already used in this run; the chosen
    name is added to it.
    """
    # Use only the last path component, so the query string and any
    # directory parts never reach the local file system.
    name = posixpath.basename(urlparse(img_url).path).replace('\\', '_')
    if name in ('', '.', '..'):
        name = 'image_%d' % idx
    stem, ext = os.path.splitext(name)
    candidate, n = name, 1
    # Compare case-insensitively so names do not clash on case-insensitive
    # file systems either.
    while candidate.lower() in taken:
        candidate = '%s_%d%s' % (stem, n, ext)
        n += 1
    taken.add(candidate.lower())
    return candidate


def download(dir, url):
    """Download the images referenced by a web page.

    @dir  local directory to save the images into (created if missing)
    @url  URL of the web page; must start with http:// or https://

    Returns None. An invalid URL is reported and nothing is created. Errors
    while fetching the page itself propagate to the caller. A failure on an
    individual image is reported and the remaining images are still tried.
    """
    if not URL_REG.match(url):
        print('[Error]Invalid URL:  %s' % url)
        return
    if not os.path.isdir(dir):
        os.makedirs(dir)

    # Fetch the HTML and extract the image URLs.
    imgs = _image_urls(_fetch_html(url), url)
    print('[Info]Find %d images.' % len(imgs))

    # Download the images.
    taken = set()
    for idx, img in enumerate(imgs, 1):
        path = os.path.join(dir, _local_name(img, idx, taken))
        print('[Info]Download(%d): %s' % (idx, img))
        try:
            urlretrieve(img, path)
        except Exception as exc:
            # Best effort: one broken image must not abort the whole run.
            print("[Error]Cant't download(%d): %s (%s)" % (idx, img, exc))


def main():
    """Command-line entry point; expects exactly <target_dir> and <page_url>."""
    if len(sys.argv) != 3:
        print('Invalid argument count.')
        return
    target_dir, page_url = sys.argv[1:]
    download(target_dir, page_url)


if __name__ == '__main__':
    main()