Python static web crawler with xpath (a simple blog update reminder)
The code below can be run directly:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: Alan

import requests
from lxml import etree
import datetime
import time
import os


class xxoohelper(object):

    def __init__(self):
        self.url = 'http://www.cnblogs.com/alan-babyblog/'  # the blog page to monitor

    def getSource(self):
        # .content (bytes) works better than .text (str) here, since lxml parses bytes directly
        html = requests.get(self.url).content
        return html

    def getContent(self, html):
        # Build the selector and narrow down from the outer div to the inner elements
        selector = etree.HTML(html)
        title = selector.xpath('//div[1]/div[2]/a/text()')[0].strip()  # extract the post title
        content = selector.xpath('//div[1]/div[2]/div[1]/div[1]/div[3]/div/text()')[0].strip()
        post_time = selector.xpath('//div[1]/div[2]/div[1]/div[1]/div[5]/text()')[0].strip()
        send_text = title + content + post_time  # the result is a plain str
        return send_text

    def tosave(self, text):
        with open('myblog.txt', 'a') as f:
            f.write('{0}\n'.format(text))  # append the new entry with a trailing newline

    def tocheck(self, data):
        if not os.path.exists('myblog.txt'):  # first run: no record file yet
            return True
        with open('myblog.txt', 'r') as f:
            existblog = f.readlines()
            if data + '\n' in existblog:  # this content has already been recorded
                return False
            return True


if __name__ == '__main__':  # program entry
    helper = xxoohelper()  # instantiate the helper
    while True:  # poll the page continuously
        source = helper.getSource()
        content = helper.getContent(source)
        if helper.tocheck(content):
            post_time = str(datetime.datetime.now())
            print(post_time, 'new content\n', content)
            helper.tosave(content)
        else:
            print('scanning......')
        time.sleep(30)
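The xpath expressions above are tied to the layout of that particular cnblogs page, so they will not transfer to other sites. As a minimal sketch of the same etree.HTML + xpath pattern, here is the extraction step run against an invented HTML fragment (sample_html is made up for illustration and is not the real page markup):

from lxml import etree

# A made-up fragment standing in for a fetched page
sample_html = b'''
<div>
  <div></div>
  <div>
    <a href="/post/1">Hello xpath</a>
  </div>
</div>
'''

selector = etree.HTML(sample_html)
# //div[1] matches divs that are the first div child of their parent,
# /div[2] then takes the second div child, and /a/text() returns the link text
title = selector.xpath('//div[1]/div[2]/a/text()')[0].strip()
print(title)  # prints: Hello xpath

In the crawler itself, the fixed snippet is replaced by the bytes returned from requests.get(self.url).content.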