標籤:爬蟲 python
python 爬蟲爬取美女圖片
#coding=utf-8import urllibimport reimport osimport timeimport threadingdef getHtml(url): page = urllib.urlopen(url) html = page.read() return htmldef getImgUrl(html,src): srcre = re.compile(src) srclist = re.findall(srcre,html) return srclistdef getImgPage(html): url = r'http://.*\.html' urlre = re.compile(url) urllist = re.findall(urlre,html) return urllistdef downloadImg(url): html = getHtml(url) src = r'rel=.*\.jpg' srclist = getImgUrl(html,src) srclist2 = [] for srcs in srclist: temp = srcs.replace("'",'"') temp = temp.split('"') srclist2.append(temp[1]) for srcurl in srclist2: imgName = srcurl.replace(':','_') imgName = imgName.replace('/','_') print 'download pic %s .........' % srcurl if os.path.isfile('pic/%s' % imgName): continue urllib.urlretrieve(srcurl,'pic/%s' % imgName)class MyThread(threading.Thread): def __init__(self,urllist): threading.Thread.__init__(self) self.urllist = urllist def run(self): for u in self.urllist: downloadImg(u)def main(): url = 'http://www.6188.net/' html = getHtml(url) urllist = getImgPage(html) urllist2 = [] length = len(urllist) / 7 for i in range(1,8): temp = urllist[(i-1)*length:i*length] urllist2.append(temp) for u in urllist2: t = MyThread(u) t.start()main()
python 爬蟲爬取美女圖片