今天實現一個使用python requests模組爬取http://www.mzitu.com/xinggan/網站的妹子圖片,並儲存到本地,效果如下:
先說下思路:(1)擷取所有分頁的url;(2)擷取當前頁的所有主題url和標題,根據標題建立檔案夾;
(3)解析指定的主題url,下載圖片到該主題對應的檔案夾中。 擷取所有的url
可以看到,這裡一共有81頁,所有的url格式固定,後面跟的就是頁數了
http://www.mzitu.com/xinggan/page/2/
def get_all_urls(self, total_pages=81):
    """Build the URL of every listing page of the "xinggan" category.

    All listing pages share one fixed URL pattern that ends with the
    1-based page number.

    Args:
        total_pages: Number of listing pages to generate. Defaults to 81,
            the page count that used to be hard-coded here.

    Returns:
        list: The listing-page URLs in ascending page order; empty when
        ``total_pages`` is zero or negative.
    """
    url_template = "http://www.mzitu.com/xinggan/page/{0}"
    return [url_template.format(page) for page in range(1, total_pages + 1)]
擷取當前頁的所有url和標題
scrapy shell http://www.mzitu.com/xinggan/page/1
# Collect the links and titles found on one listing page into a dict.
def get_title_urls(self, url):
    """Map every gallery link on a listing page to its title.

    Args:
        url: URL of the listing page to download and parse.

    Returns:
        dict: ``{gallery_url: title}`` built from the anchors selected by
        ``#pins span a``. Links and titles are paired by position.
    """
    response = requests.get(url, headers=headers)
    selector = Selector(text=response.text)
    # Use dedicated names instead of rebinding the ``url`` parameter.
    gallery_urls = selector.css("#pins span a::attr(href)").extract()
    titles = selector.css("#pins span a::text").extract()
    # zip() stops at the shorter list, so a page yielding fewer titles than
    # links no longer raises IndexError.
    return dict(zip(gallery_urls, titles))
擷取指定url的所有圖片連結 擷取圖片url
response.css(".main-image img::attr(src)").extract()[0]
- 擷取當前url所有圖片連結
可以看到,其實不用訪問所有的url頁面來擷取圖片地址,因為同一個主題下的圖片url是有規律的(檔名最後兩位就是頁數),因此,我們可以用代碼構造出這些圖片的url。 擷取當前的總頁數
可以看到,擷取當前總頁數,只需要擷取 ‘pagenavi’下的倒數第二個span的內容即可
擷取當前主題下的所有圖片url
# Get every image URL that belongs to the current gallery.
def get_image_urls(self, url):
    """Return the URLs of all images of the gallery at ``url``.

    Only the first gallery page is downloaded. The remaining image URLs are
    derived from the first image URL and the total page count.

    Args:
        url: URL of the gallery's first page.

    Returns:
        list: Image URLs, one per gallery page.

    Raises:
        ValueError: If the total page count cannot be parsed.
        IndexError: If the page has no pagination spans or no main image.
    """
    import re

    selector = self.get_selector(url)

    # The second-to-last span under ".pagenavi" carries the total page count.
    pages_nav = selector.css(".pagenavi a span").extract()
    total_page_html = pages_nav[len(pages_nav) - 2]
    match_obj = re.match(r"<span>(\d+)</span>", total_page_html)
    if not match_obj:
        # Fail here with a clear message rather than later inside int().
        raise ValueError(
            "cannot parse total page count from: " + total_page_html)
    total_page = match_obj.group(1)

    # Image shown on the first page of the gallery.
    origin_img_url = selector.css(".main-image img::attr(src)").extract()[0]
    return self.get_image_urls_step(origin_img_url, total_page)


# Derive every image URL of the gallery from the URL naming pattern.
def get_image_urls_step(self, image_url, page_number):
    """Generate the image URLs for pages 1..``page_number``.

    The last two characters of the file stem are taken to be the page
    number, so ``.../24b01.jpg`` yields ``.../24b01.jpg``, ``.../24b02.jpg``
    and so on.

    Args:
        image_url: URL of any ``.jpg`` image of the gallery.
        page_number: Total number of pages, as an int or a numeric string.

    Returns:
        list: The generated image URLs; empty when ``image_url`` does not
        look like ``.../<name>.jpg``.
    """
    import re

    # Capture the file stem (the part between the last "/" and ".jpg").
    match_obj = re.match(r'.*/(.*)\.jpg', image_url)
    if not match_obj:
        return []

    stem = match_obj.group(1)
    stem_start, stem_end = match_obj.span(1)
    # Slice by position instead of str.replace(): replace() rewrites the
    # FIRST occurrence, which is wrong for stems such as "01b01" and for
    # stems that also occur earlier in the URL.
    prefix = image_url[:stem_start] + stem[:-2]
    suffix = image_url[stem_end:]
    return [
        prefix + "{0:02d}".format(page) + suffix
        for page in range(1, int(page_number) + 1)  # pages are 1-based
    ]
可以看到,已經正確列印出當前的url了
完整代碼
在閱讀完整代碼之前,我還是先理一下思路:(1)擷取當前sex模組的所有頁面的url;(2)擷取當前頁的所有圖片url和標題,標題用來建立檔案夾;
(3)下載給定url的圖片到本地。
下面是完整代碼
# -*- coding: utf-8 -*-
import os
import re

import requests
from scrapy.selector import Selector

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'
}


class GetSexGirl(object):
    """Scraper that downloads every gallery of the "xinggan" category.

    Workflow: build the listing-page URLs, read the gallery links and
    titles from each listing page, derive the image URLs of each gallery,
    and save the images into one folder per gallery title.
    """

    # Build the URL of every listing page.
    def get_all_urls(self, total_pages=81):
        """Return the listing-page URLs for pages 1..``total_pages``.

        Args:
            total_pages: Number of listing pages. Defaults to 81, the
                value that used to be hard-coded.

        Returns:
            list: Listing-page URLs in ascending page order.
        """
        url_template = "http://www.mzitu.com/xinggan/page/{0}"
        return [url_template.format(page)
                for page in range(1, total_pages + 1)]

    # Collect the gallery links and titles of one listing page.
    def get_title_urls(self, url):
        """Map every gallery link on a listing page to its title.

        Args:
            url: URL of the listing page.

        Returns:
            dict: ``{gallery_url: title}``, paired by position.
        """
        selector = self.get_selector(url)
        # Use dedicated names instead of rebinding the ``url`` parameter.
        gallery_urls = selector.css("#pins span a::attr(href)").extract()
        titles = selector.css("#pins span a::text").extract()
        # zip() stops at the shorter list, so a length mismatch no longer
        # raises IndexError.
        return dict(zip(gallery_urls, titles))

    # Get every image URL that belongs to one gallery.
    def get_image_urls(self, url):
        """Return the URLs of all images of the gallery at ``url``.

        Args:
            url: URL of the gallery's first page.

        Returns:
            list: Image URLs, one per gallery page.

        Raises:
            ValueError: If the total page count cannot be parsed.
            IndexError: If the page has no pagination spans or no main
                image.
        """
        print('get_image_urls url is :'+url)
        selector = self.get_selector(url)

        # The second-to-last span under ".pagenavi" carries the total
        # page count.
        pages_nav = selector.css(".pagenavi a span").extract()
        total_page_html = pages_nav[len(pages_nav) - 2]
        match_obj = re.match(r"<span>(\d+)</span>", total_page_html)
        if not match_obj:
            # Fail here with a clear message rather than later in int().
            raise ValueError(
                "cannot parse total page count from: " + total_page_html)
        total_page = match_obj.group(1)

        # Image shown on the first page of the gallery.
        origin_img_url = selector.css(
            ".main-image img::attr(src)").extract()[0]
        return self.get_image_urls_step(origin_img_url, total_page)

    # Derive every image URL of a gallery from the URL naming pattern.
    def get_image_urls_step(self, image_url, page_number):
        """Generate the image URLs for pages 1..``page_number``.

        The last two characters of the file stem are taken to be the page
        number (e.g. ``24b01.jpg``, ``24b02.jpg``, ...).

        Args:
            image_url: URL of any ``.jpg`` image of the gallery.
            page_number: Total number of pages (int or numeric string).

        Returns:
            list: The generated image URLs; empty when ``image_url`` does
            not look like ``.../<name>.jpg``.
        """
        # Capture the file stem (between the last "/" and ".jpg").
        match_obj = re.match(r'.*/(.*)\.jpg', image_url)
        if not match_obj:
            return []

        stem = match_obj.group(1)
        stem_start, stem_end = match_obj.span(1)
        # Slice by position instead of str.replace(): replace() rewrites
        # the FIRST occurrence, which is wrong for stems like "01b01".
        prefix = image_url[:stem_start] + stem[:-2]
        suffix = image_url[stem_end:]
        return [
            prefix + "{0:02d}".format(page) + suffix
            for page in range(1, int(page_number) + 1)  # pages are 1-based
        ]

    # Download one image into the given folder.
    def download_image(self, base_dir, image_url):
        """Download ``image_url`` and store it inside ``base_dir``.

        The file keeps the name it has in the URL, e.g.
        ``http://i.meizitu.net/2017/04/24b01.jpg`` is saved as
        ``24b01.jpg``. A failed request is skipped silently so one broken
        image does not stop the whole crawl.

        Args:
            base_dir: Existing directory that receives the file.
            image_url: URL of the image to download.
        """
        print('basedir is :'+base_dir+' image_url is :'+image_url)
        # Take the name from the URL path rather than a fixed slice, so
        # stems of any length work and the name is not duplicated.
        image_name = os.path.basename(image_url)
        file_name = os.path.join(base_dir, image_name)
        print('file_name is :'+file_name)
        try:
            img_response = requests.get(image_url, headers=headers)
            # Treat HTTP error statuses as failures so an error page is
            # never saved as a .jpg.
            img_response.raise_for_status()
        except requests.RequestException:
            return
        # 'wb' rather than 'ab': re-running must overwrite the file, not
        # append a second copy. Binary payload comes from ``.content``.
        with open(file_name, 'wb') as f:
            f.write(img_response.content)

    def get_selector(self, url):
        """Download ``url`` and return a Scrapy ``Selector`` over its HTML."""
        response = requests.get(url, headers=headers)
        return Selector(text=response.text)


if __name__ == '__main__':
    get_sex_girl = GetSexGirl()
    # URLs of every listing page of the category.
    total_url_list = get_sex_girl.get_all_urls()
    for page_url in total_url_list:
        # Fetch the listing page of THIS iteration (previously page 2 was
        # hard-coded, so the same page was crawled on every pass).
        map_title_url = get_sex_girl.get_title_urls(page_url)
        for url, title in map_title_url.items():
            # One folder per gallery, named after its title. Path
            # separators in a scraped title must not create nested folders.
            safe_title = title.replace(os.sep, "_")
            base_dir = "/home/liuhang/code/sexgirl/{0}/".format(safe_title)
            if not os.path.exists(base_dir):
                print(title+' 主題的檔案夾不存在, 建立它...base_dir is:' + base_dir)
                # Create the folder that will hold the images.
                os.makedirs(base_dir)
            # Every image URL of the current gallery.
            image_url_list = get_sex_girl.get_image_urls(url)
            for image_url in image_url_list:
                # Save each image into the gallery's folder.
                get_sex_girl.download_image(base_dir, image_url)
代碼下載