Case: Crawler using XPath
Now we use XPath to build a simple crawler: it crawls every post in a Baidu Tieba forum and downloads the images from each floor (reply) of each post to the local disk.
# tieba_xpath.py#!/usr/bin/env python#-*-Coding:utf-8-*-Import OSImport UrllibImport Urllib2From lxmlImport etreeClassSpider:Def__init__(self): Self.tiebaname = Raw_input ("Please need to visit the bar:") self.beginpage = Int (Raw_input ("Please enter the start page:") self.endpage = Int (Raw_input ("Please enter the termination page:") Self.url =' http://tieba.baidu.com/f ' Self.ua_header = {"User-agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 trident/5.0; "}# picture Number Self.username =1DefTiebaspider(self):For pageIn range (Self.beginpage, Self.endpage +1): PN = (Page-1) *50# page Number word = {' PN ': pn,' kw ': self.tiebaname} word = Urllib.urlencode (word)#转换成url编码格式 (string) Myurl = Self.url +"?" + Word# example: http://tieba.baidu.com/f? KW=%E7%BE%8E%E5%A5%B3 & Pn=50# Invoke page handler function load_page# and get links to all posts on page, links = self.loadpage (Myurl)# urllib2_test3.py# Read Page contentDefLoadPage(Self, url): req = urllib2. Request (url, headers = self.ua_header) HTML = Urllib2.urlopen (req). Read ()# Parse HTML for HTML document SELECTOR=ETREE. HTML (HTML)#抓取当前页面的所有帖子的url的后半部分, which is the post number# "p/4884069807" links in http://tieba.baidu.com/p/4884069807 = Selector.xpath ('//div[@class = "Threadlist_lz clearfix"]/div/a/@href ')# links type is etreeelementstring list# Iterate through the list and merge into a post address, call the picture processing function loadimageFor linkIn Links:link ="http://tieba.baidu.com" + link self.loadimages (link)# Get PicturesDefLoadimages(Self, link): req = urllib2. Request (link, headers = self.ua_header) HTML = Urllib2.urlopen (req). Read () selector = etree. 
HTML (HTML)# get the src path of all images in this post imageslinks = Selector.xpath ('//img[@class = ' bde_image ']/@src ')# Take the picture path in turn, download saveFor ImageslinkIn ImagesLinks:self.writeImages (Imageslink)# save page ContentDefWriteimages(Self, Imageslink):"" to deposit binary contents of images into the Usernname file "print imageslink print "storing file%d ... "% self.username # 1. Opens the file, returns a file object, filename = open ( ' WB ') # 2. Get the contents of the picture images = Urllib2.urlopen (imageslink). Read () # 3. Call the File object write () method, and the Page_ The contents of the HTML are written to the file File.write (images) # 4. Finally close the file File.close () # Counter self-increment 1 self.username + = 1# simulate main function if __name__ = "__main__": # First create the Crawler object Myspider = Spider () # Call the Crawler object method, start working Myspider.tiebaspider ()
Python XML example