Python crawler project: crawling Jandan's sister pictures (Part 1)
A practice exercise in scraping sister pictures from jandan.net. The page URL format is:
http://jandan.net/ooxx/page-1777#comments
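Only the page number varies from page to page, so the whole range of pages can be generated up front. A small sketch (the 1530-1883 range matches the full script below):

base = "http://jandan.net/ooxx/page-"
# Generate one URL per page by substituting the page number
pages = [base + str(i) + '#comments' for i in range(1530, 1883)]
print(pages[0])  # http://jandan.net/ooxx/page-1530#comments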
Analyzing the page source shows that each sister picture appears in two forms.
One is a thumbnail image
The other is the full-size original image:
<a href="http://ww1.sinaimg.cn/large/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" target="_blank" class="view_img_link">[View original]</a>
Here we crawl the original images, locating the links by their class and target attributes. Each page's image URLs are saved to its own TXT file; merging those files and downloading the images comes in the next part.
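Isolated from the full script below, here is a minimal sketch of that extraction step, run against an inline copy of the markup shown above:

from bs4 import BeautifulSoup

# Inline copy of the anchor shown above, standing in for a full page
html = ('<a href="http://ww1.sinaimg.cn/large/4bf31e43jw1f09htnzkh5j20dw0kumz0.jpg" '
        'target="_blank" class="view_img_link">[View original]</a>')
soup = BeautifulSoup(html, 'lxml')
# Match anchors by both the target and class attributes
for a in soup.find_all("a", target="_blank", class_="view_img_link"):
    print(a['href'])  # -> http://ww1.sinaimg.cn/large/4bf31e43...jpg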
You'll have to find your own proxy IP file :-D The full source code is as follows:
# coding: utf-8
####################################
# Coding by Liu Yunfei
####################################

import os
import time
import random
import threading

import requests
from bs4 import BeautifulSoup

url = "http://jandan.net/ooxx/page-"
img_lists = []      # all original-image URLs found
url_lists = []      # page URLs to crawl
not_url_lists = []  # pages that failed to download
ips = []            # proxy pool
thread_list = []    # unused here (threading is reserved for a later version)

# ip2.txt should hold one proxy per line, e.g. 1.2.3.4:8080
with open('ip2.txt', 'r') as f:
    for line in f.readlines():
        ips.append("http://" + line.strip())

headers = {
    'Host': 'jandan.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) '
                  'Gecko/20100101 Firefox/42.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-cn,zh;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Referer': 'http://jandan.net/ooxx/',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}

# Build the list of pages to crawl
for i in range(1530, 1883):
    url_lists.append(url + str(i) + '#comments')


def write_to_txt(name, lst):
    with open(name, 'w+') as f:
        for url_one in lst:
            f.write(url_one + "\n")


def get_img_url(page_url):
    single_ip_addr = random.choice(ips)   # a random proxy for each request
    lists_tmp = []
    page = int(page_url[28:32])           # the 4-digit page number in the URL
    filename = str(page) + ".txt"
    proxies = {'http': single_ip_addr}
    try:
        res = requests.get(page_url, headers=headers, proxies=proxies)
        print(res.status_code)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'lxml')
            # Original images live in <a target="_blank" class="view_img_link">
            results = soup.find_all("a", target="_blank", class_="view_img_link")
            for img in results:
                lists_tmp.append(img['href'])
                img_lists.append(img['href'])
            print(page_url + " --->>>> crawl complete.")
            write_to_txt(filename, lists_tmp)
        else:
            not_url_lists.append(page_url)
            print("Not OK")
    except Exception:
        not_url_lists.append(page_url)
        print("Not OK")


for page_url in url_lists:
    page = int(page_url[28:32])
    filename = str(page) + ".txt"
    if os.path.exists(filename):   # this page was already crawled, skip it
        print(page_url + ' is passed')
    else:
        # time.sleep(1)
        get_img_url(page_url)

print(img_lists)

with open("img_url.txt", 'w+') as f:
    for img_url in img_lists:
        f.write(img_url + "\n")

print("Total " + str(len(img_lists)) + " pictures.")
print("All done!!!")

with open("not_url_lists.txt", 'w+') as f:
    for page_url in not_url_lists:
        f.write(page_url + "\n")