An example tutorial on implementing a crawler with requests and lxml

# requests module: requests the page
# lxml html module: builds a selector from the response (parses the HTML)
# from lxml import html
# import requests

# response = requests.get(url).content

# selector = html.fromstring(response)

# hrefs = selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")

# Using url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html' as an example
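Taken together, the comment lines above sketch the whole fetch-parse-extract flow. Here is that sketch as a tiny runnable example; it parses an inline HTML string instead of the live page, so the markup below is an illustrative stand-in for mafengwo's actual structure, not a copy of it:

from lxml import html

SAMPLE = """
<html><body>
  <div class="feed-item _j_feed_item"><a href="/page/1">one</a></div>
  <div class="feed-item _j_feed_item"><a href="/page/2">two</a></div>
</body></html>
"""

selector = html.fromstring(SAMPLE)
hrefs = selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")
print(hrefs)  # ['/page/1', '/page/2']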

# python 2.7
import requests
from lxml import html
import os

# Get the URLs of the child pages
def get_page_urls(url):
    response = requests.get(url).content
    # Build the selector via lxml's html module
    selector = html.fromstring(response)
    urls = []
    for i in selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href"):
        urls.append(i)
    return urls
# Get the title from a child page's HTML (div[@class='title'])
def get_page_a_title(url):
    '''url is an a/@href taken from the ziyouxing page'''
    response = requests.get(url).content
    selector = html.fromstring(response)
    # XPath found with Chrome's developer tools:
    # /html/body//div[@class='title']/text()
    a_title = selector.xpath("/html/body//div[@class='title']/text()")
    return a_title
# Build the page selector (via lxml's html module)
def get_selector(url):
    response = requests.get(url).content
    selector = html.fromstring(response)
    return selector
# After analyzing the page structure with Chrome's developer tools, we find that the text content we need sits mainly in div[@class='l-topic'] and div[@class='p-section']
# Get the required text content
def get_page_content(selector):
    # /html/body/div[2]/div[2]/div[1]/div[@class='l-topic']/p/text()
    page_title = selector.xpath("//div[@class='l-topic']/p/text()")
    # /html/body/div[2]/div[2]/div[1]/div[2]/div[15]/div[@class='p-section']/text()
    page_content = selector.xpath("//div[@class='p-section']/text()")
    return page_title, page_content
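The absolute Chrome-derived paths kept in the comments are brittle (an index like div[15] breaks as soon as the layout shifts), which is why the queries above match on class instead. A quick offline sanity check of those class-based queries, run against assumed markup modeled only on the class names from the text:

from lxml import html

snippet = html.fromstring("""
<div><div class="l-topic"><p>Sample title</p></div>
<div class="p-section">First paragraph.</div>
<div class="p-section">Second paragraph.</div></div>
""")
print(snippet.xpath("//div[@class='l-topic']/p/text()"))  # ['Sample title']
print(snippet.xpath("//div[@class='p-section']/text()"))  # ['First paragraph.', 'Second paragraph.']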
# Get the URL addresses of the images in the page
def get_image_urls(selector):
    image_srcs = selector.xpath("//img[@class='_j_lazyload']/@src")
    return image_srcs
# Get the title of an image
def get_image_title(selector, num):
    # num starts from 2
    xpath = ("/html/body/div[2]/div[2]/div[1]/div[2]/div[" + str(num) +
             "]/span[@class='img-an']/text()")
    titles = selector.xpath(xpath)
    if titles:  # xpath() returns a list; an empty list means no caption
        image_title = titles[0]
    else:
        image_title = "map" + str(num)  # the image has no caption of its own
    return image_title
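One pitfall the fallback above guards against: xpath() never returns None, it returns a (possibly empty) list, so the test has to be on the list itself rather than a comparison with None. A quick demonstration:

from lxml import html

no_caption = html.fromstring("<div><img src='a.jpg'/></div>")
titles = no_caption.xpath("//span[@class='img-an']/text()")
print(titles)        # [] -- an empty list, not None
print(bool(titles))  # False, which triggers the "map" + str(num) fallback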
# Download the images
def download_images(selector, number):
    '''number is used to count the pages'''
    urls = get_image_urls(selector)
    num = 2
    amount = len(urls)
    path = "/home/workspace/tour/words/result" + str(number) + "/"
    if not os.path.exists(path):
        os.makedirs(path)  # create the result directory, not the file itself
    for url in urls:
        image_title = get_image_title(selector, num)
        filename = path + image_title + ".jpg"
        print('Downloading %s image %s' % (number, image_title))
        with open(filename, 'wb') as f:
            f.write(requests.get(url).content)
        num += 1
    print('%s of %s images have been downloaded' % (num - 2, amount))
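download_images reads each whole image into memory via .content, which is fine for photos but wasteful for large files. As an optional variant that is not in the original, requests' documented stream mode can write the body out in chunks; download_file is a hypothetical helper name:

import requests

def download_file(url, filename):
    # Stream the response and write it out in 8 KB chunks
    resp = requests.get(url, stream=True, timeout=10)
    resp.raise_for_status()
    with open(filename, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)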

# Entry point: start crawling and save the retrieved data to files
if __name__ == '__main__':
    url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html'
    urls = get_page_urls(url)
    number = 1
    for i in urls:
        selector = get_selector(i)
        # download the images
        download_images(selector, number)
        # get the text and write it into a file
        page_title, page_content = get_page_content(selector)
        # xpath() returns lists, so join the pieces before writing
        result = '\n'.join(page_title) + '\n' + '\n'.join(page_content) + '\n'
        path = "/home/workspace/tour/words/result" + str(number) + "/"
        if not os.path.exists(path):
            os.makedirs(path)
        filename = path + str(number) + ".txt"
        with open(filename, 'wb') as f:
            f.write(result)
        print(result)
        number += 1
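A practical caveat before pointing this at a real site: every function calls requests.get() bare, with no headers, timeout, or delay between requests, and many sites throttle or block such traffic. A minimal sketch of a politer fetch helper the functions above could call instead of requests.get(url).content; polite_get and the User-Agent value are assumptions, not part of the original:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}  # assumed UA string

def polite_get(url, delay=1.0):
    time.sleep(delay)        # throttle between requests
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()  # fail loudly on HTTP errors
    return resp.content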

This concludes the crawler. Before crawling a page you must carefully analyze its HTML structure. Some pages are generated by JavaScript; this one is relatively simple and does not involve handling JS, which future posts will cover.
