import urllib.request
import urllib.parse

from lxml import etree

# One shared request header set: without a browser User-Agent Baidu serves a
# degraded page (or blocks the request), so every fetch sends it.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/54.0.2840.99 Safari/537.36",
}


def loadPage(url):
    """Fetch one Tieba list page and hand every post link to loadimage.

    url: full list-page URL (forum base URL plus "&pn=" offset).
    """
    request = urllib.request.Request(url, headers=HEADERS)
    html = urllib.request.urlopen(request).read()
    # Parse the raw HTML into a DOM tree for XPath queries.
    content = etree.HTML(html)
    # Relative hrefs of every post on this list page.
    # NOTE(review): "t_con cleafix" is the literal (misspelled) class name
    # Baidu uses on list items -- do not "correct" the spelling.
    link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
    for link in link_list:
        # The hrefs are relative ("/p/123..."); build the absolute post URL.
        fulllink = "http://tieba.baidu.com" + link
        loadimage(fulllink)


def loadimage(link):
    """Fetch one post page and download every user-posted image in it.

    link: absolute URL of a single post.
    """
    request = urllib.request.Request(link, headers=HEADERS)
    html = urllib.request.urlopen(request).read()
    content = etree.HTML(html)
    # "BDE_Image" is the class Baidu puts on user-posted images --
    # TODO(review): confirm the exact casing against a live post page.
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    for image_link in link_list:
        print(image_link)
        writeimage(image_link)


def writeimage(link):
    """Download one image and write it to the local disk.

    link: direct URL of the image. The last 10 characters of the URL are
    used as the local file name (enough to keep Tieba image names unique).
    """
    request = urllib.request.Request(link, headers=HEADERS)
    # Raw image bytes.
    image = urllib.request.urlopen(request).read()
    filename = link[-10:]
    # Double backslash so "\i" is not read as an escape sequence; the
    # d:\image directory must already exist.
    with open("d:\\image\\" + filename, "wb") as f:
        f.write(image)
    print("downloaded successfully" + filename)


def tiebaspider(url, beginpage, endpage):
    """Crawler scheduler: build and crawl the URL of each list page.

    url: forum URL up to, but not including, the "&pn=" paging offset.
    beginpage, endpage: inclusive 1-based page range to crawl.
    """
    for page in range(beginpage, endpage + 1):
        # Tieba paginates 50 posts per list page, so the "pn" query
        # parameter is a post offset, not a page number.
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        loadPage(fullurl)
    print("Thank you")


if __name__ == "__main__":
    kw = input("Please enter the bar name to crawl:")
    beginpage = int(input("Enter start Page:"))
    endpage = int(input("Enter end page:"))
    url = "http://tieba.baidu.com/f?"
    # urlencode handles the non-ASCII forum name (e.g. Chinese characters).
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaspider(fullurl, beginpage, endpage)
Python 3 crawler: downloading images from Baidu Tieba (forum) posts