# The requests module fetches the page; lxml's html.fromstring builds an
# XPath selector from the response body. The pattern used throughout:
#
#   from lxml import html
#   import requests
#   response = requests.get(url).content
#   selector = html.fromstring(response)
#   hrefs = selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")
#
# Example entry URL: https://www.mafengwo.cn/gonglve/ziyouxing/2033.html
# python 2.7
import requests
from lxml import html
import os
1 # Get the URL of a child page 2 def get_page_urls (URL): 3 response = requests.get (URL). content4 # Build selector 5 from lxml html Selector = html.fromstring (response) 6 urls = []7 for I in Selector.xpath ("/html/body//div[@class = ' Feed-item _j _feed_item ']/a/@href "): 8 urls.append (i) 9 return URLs
def get_page_a_title(url):
    """Fetch a child page and return its title text.

    *url* is one of the a@href links returned by get_page_urls().
    Returns the (possibly empty) list of text nodes matched by the
    title div — xpath() always returns a list.
    """
    response = requests.get(url).content
    selector = html.fromstring(response)
    # XPath found with Chrome's developer tools:
    #   /html/body//div[@class='title']/text()
    a_title = selector.xpath("/html/body//div[@class='title']/text()")
    return a_title
1 # Get the page selector (built via lxml HTML) 2 def get_selector (URL): 3 response = requests.get (URL). Content 4 selector = html.fr Omstring (response) 5 return Selector
# After analyzing the HTML page structure with Chrome's developer tools, we find that the text content we need to get is mainly shown in div[@class = ' l-topic ') and div[@class = ' p-section ')
1 # Get the required text content 2 def get_page_content (selector): 3 #/html/body/div[2]/div[2]/div[1]/div[@class = ' L-topic ']/p/text () 4 page_title = Selector.xpath ("//div[@class = ' l-topic ']/p/text ()") 5 #/html/body/div[2]/ div[2]/div[1]/div[2]/div[15]/div[@class = ' p-section ']/text () 6 page_content = Selector.xpath ("//div[@class = ' P-section ']/text () ") 7 return page_title,page_content
1 # Get Picture in page URL address 2 def get_image_urls (selector): 3 Imagesrcs = Selector.xpath ("//img[@class = ' _j_lazyload ']/@src") 4 return Imagesrcs
# get the title of the picture
1 def get_image_title (selector, num) 2 # num is 3 url = "/html/body/div[2]/div[2]/div[1]/div[2]/div[" +num+ "starting from 2" ]/span[@class = ' Img-an ']/text () "4 if Selector.xpath (URL) is not none:5 image_title = Selector.xpath (URL) 6 else:7 image_title = "map" +str (num) # does not start with a 8 return Image_title
# download Images
1 def downloadimages (selector,number): 2 "number is used to count the ' 3 urls = Get_image_urls () 4 num = 2 5 Amount = Len (URLs) 6 for the URL in urls:7 image_title = get_image_title (selector, num) 8 filename = "/home/workspace/to Ur/words/result "+number+"/+ "image_title+". jpg "9 if not os.path.exists (filename): ten os.makedirs (filename) print (' Downloading%s image%s '% (number, image_title)) with open (filename, ' WB ') as F:13 F.write ( Requests.get (URL). content) num + + + + print "%s" has been downloaded "%num
# entry, start and save the retrieved data to the file if __name__ = = ' __main__ ': url = ' urls = get_page_urls (URL) # turn to get response from htmlnumber = 1for i in URLs: selector = Get_selector (i) # download images downloadimages (selector,number) # get text and write I Nto a filepage_title, page_content = get_page_content (selector) result = Page_title+ ' \ n ' +page_content+ ' \ n ' path = "/home/workspace/tour/words/result" +num+ "/" If not os.path.exists (filename): os.makedirs (filename) filename = path + "num" + ". txt" with open (filename, ' WB ') as F: f.write (result) Print result
# That concludes this crawler. When scraping, always analyze the page's HTML
# structure carefully first; some pages are generated by JavaScript. This page
# was relatively simple and needed no JS handling — future posts will cover
# that topic.