python項目之 爬蟲爬取煎蛋jandan的妹子圖-上
抓取妹子圖練練手。 網頁url格式
http://jandan.net/ooxx/page-1777#comment
只需改變頁碼1777即可 分析頁面源碼發現妹子圖有兩個
一個是縮圖
<img src="http://ww1.sinaimg.cn/mw600/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" /></p>
另一個是原圖
<a href="http://ww1.sinaimg.cn/large/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" target="_blank" class="view_img_link">[查看原圖]</a>
這裡我們抓取原圖,使用class和target這個屬性尋找。 最終得到每一頁的TXT檔案,下篇是檔案合并與圖片存取。 源碼如下
代理ip檔案請自行尋找:-D
# coding:utf-8
"""Crawler for jandan.net 'ooxx' listing pages (original author: 劉雲飛).

For every page number in range(1530, 1883) the script fetches
http://jandan.net/ooxx/page-<n>#comments through a randomly chosen HTTP
proxy (read from ip2.txt, one host:port per line), extracts the
full-size image links marked

    <a target="_blank" class="view_img_link">

and writes them to a per-page "<n>.txt" file.  Pages that fail are
collected in not_url_lists.txt; all image URLs go to img_url.txt.
"""
import os
import random

url = "http://jandan.net/ooxx/page-"  # listing-page URL prefix

img_lists = []      # every image URL extracted, across all pages
url_lists = []      # listing-page URLs to crawl
not_url_lists = []  # listing pages that failed (bad proxy / non-200 / parse error)
ips = []            # proxy URLs, e.g. "http://1.2.3.4:8080"

headers = {
    'Host': 'jandan.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/42.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Referer': 'http://jandan.net/ooxx/',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}


def writeToTxt(name, urls):
    """Write one URL per line to the file *name* (truncates any existing file)."""
    # parameter renamed from `list`, which shadowed the builtin
    with open(name, 'w+') as f:
        for one in urls:
            f.write(one + "\n")


def page_num(page_url):
    """Return the numeric page id embedded in a listing-page URL.

    Robust replacement for the original's brittle fixed slice
    ``page_url[28:32]``, which only works for 4-digit page numbers.
    """
    return int(page_url.rsplit('page-', 1)[1].split('#', 1)[0])


def get_img_url(page_url):
    """Fetch one listing page via a random proxy and save its image links.

    On success the full-size image hrefs are appended to the global
    ``img_lists`` and written to "<page>.txt"; on any failure the page URL
    is recorded in ``not_url_lists`` so it can be retried later.
    """
    # Third-party imports are function-local so the module itself can be
    # imported (e.g. for testing the helpers) without requests/bs4 installed.
    import requests
    from bs4 import BeautifulSoup

    filename = str(page_num(page_url)) + ".txt"
    proxies = {'http': random.choice(ips)}
    lists_tmp = []
    try:
        res = requests.get(page_url, headers=headers, proxies=proxies)
        print(res.status_code)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')
            results = soup.find_all("a", target="_blank", class_="view_img_link")
            for img in results:
                lists_tmp.append(img['href'])
                # BUG FIX: the original appended to url_lists here, which
                # (a) left img_lists permanently empty (so img_url.txt and
                # the final count were always empty/0) and (b) mutated the
                # very list the main loop is iterating over.
                img_lists.append(img['href'])
            print(page_url + " --->>>>抓取完畢。。")
            writeToTxt(filename, lists_tmp)
        else:
            not_url_lists.append(page_url)
            print("not ok")
    except Exception:
        # Deliberate best-effort: a bad proxy, timeout, or parse error just
        # marks the page for retry.  Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate.
        not_url_lists.append(page_url)
        print("not ok")


def main():
    """Crawl pages 1530-1882 and write the summary files."""
    # Proxy list: one host:port per line; blank lines are skipped.
    with open('ip2.txt', 'r') as f:
        for line in f:
            if line.strip():
                ips.append("http://" + line.strip())

    for i in range(1530, 1883):
        url_lists.append(url + str(i) + '#comments')

    for page_url in url_lists:
        filename = str(page_num(page_url)) + ".txt"
        if os.path.exists(filename):
            # A per-page file already exists from an earlier run: skip it.
            print(page_url + " is pass")
        else:
            get_img_url(page_url)

    print(img_lists)
    with open("img_url.txt", 'w+') as f:
        for one in img_lists:
            f.write(one + "\n")
    print("共有 " + str(len(img_lists)) + " 張圖片。")
    print("all done!!!")

    with open("not_url_lists.txt", 'w+') as f:
        for one in not_url_lists:
            f.write(one + "\n")


if __name__ == '__main__':
    main()