I have recently been studying Python web crawlers. Following material I found online, I implemented a Python crawler that scrapes Qiushibaike (the "Embarrassing Encyclopedia" joke site, qiushibaike.com); these are my notes.
Here are several resources for learning to write Python crawlers:
The Liaoche Python tutorial: focuses on basic Python programming knowledge.
"Developing a Simple Crawler in Python": explains the overall structure of a Python crawler through an example.
"Python Regular Expressions": explains the regular expressions needed for pattern matching in a crawler.
"Python Crawler Tutorial Series": several worked examples for practice.
The structure of a simple crawler
The running process of the crawler
Following these tutorials, I implemented a Python crawler that scrapes Qiushibaike (the "Embarrassing Encyclopedia").
Create a new project
Create a new PyDev package
Create a new PyDev module
Edit code:
import re

# The script was written for Python 2 (urllib2 / raw_input); these shims let
# the same code run unchanged on Python 3.
try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3
    from urllib.error import URLError
    from urllib.request import Request, urlopen

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


class QSBK(object):
    """Interactive console reader for the "hot" pages of qiushibaike.com.

    Pages are downloaded one at a time, split into stories with a regular
    expression, and queued in ``self.stories``.  Each press of ENTER prints
    the next story; entering ``Q`` stops the reader.
    """

    # Captures, in order: the author name, the story body, whatever markup
    # sits between the body and the stats block, and the stats number.
    # re.S lets ``.`` span the newlines inside the page markup.
    _STORY_PATTERN = re.compile(
        r'<div.*?class="author.*?>.*?<a.*?</a>.*?<a.*?>(.*?)</a>'
        r'.*?<div.*?class="content".*?>(.*?)</div>(.*?)'
        r'<div class="stats.*?class="number">(.*?)</i>',
        re.S,
    )

    def __init__(self):
        # Number of the next page to download.
        self.pageIndex = 2
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        # The request is sent with a browser-like User-Agent header.
        self.headers = {'User-Agent': self.user_agent}
        # Queue of downloaded pages; each entry is the list of stories
        # parsed from one page.
        self.stories = []
        # Set to True by start() and back to False when the user quits.
        self.enable = False

    def getPage(self, pageIndex):
        """Download one "hot" page and return its HTML decoded as UTF-8.

        Returns None (after printing the reason) when the request fails
        with a URLError.
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        try:
            response = urlopen(Request(url, headers=self.headers))
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if decoding fails.
                response.close()
        except URLError as error:
            reason = getattr(error, "reason", error)
            print(u"connection embarrassing encyclopedia failure, "
                  u"wrong reason %s" % (reason,))
            return None

    @staticmethod
    def _parsePage(pageCode):
        """Extract the stories from one page of HTML.

        Returns a list of ``[author, text, extra, number]`` lists, one per
        regex match, with ``<br/>`` in the text turned into newlines and
        every field stripped of surrounding whitespace.
        """
        return [
            [
                author.strip(),
                content.replace('<br/>', "\n").strip(),
                extra.strip(),
                number.strip(),
            ]
            for author, content, extra, number
            in QSBK._STORY_PATTERN.findall(pageCode)
        ]

    def getPageItems(self, pageIndex):
        """Download page ``pageIndex`` and return its parsed stories.

        Returns None when the page could not be loaded (or was empty).
        """
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            print("page load failed ...")
            return None
        return self._parsePage(pageCode)

    def loadPage(self):
        """Keep the queue topped up while the reader is running.

        Fetches the next page when fewer than two pages are queued, and
        advances ``pageIndex`` only if that page yielded stories.
        """
        if not self.enable or len(self.stories) >= 2:
            return
        pageStories = self.getPageItems(self.pageIndex)
        if pageStories:
            self.stories.append(pageStories)
            self.pageIndex += 1

    def getOneStory(self, pageStories, page):
        """Print the stories of one page, one per line of user input.

        ``page`` is the page counter shown in the output.  Entering ``Q``
        (either case) disables the reader and returns immediately.
        """
        for story in pageStories:
            command = _read_line()
            # Prefetch while the user reads so the next page is ready.
            self.loadPage()
            if command in ("Q", "q"):
                self.enable = False
                return
            print(u"page %d\tpublisher: %s\tstats: %s\n%s"
                  % (page, story[0], story[3], story[1]))

    def start(self):
        """Run the interactive loop until the user quits or pages run out."""
        print(u"reading embarrassing encyclopedia, "
              u"press ENTER to view new jokes, q exit")
        self.enable = True
        self.loadPage()
        nowPage = 0
        while self.enable:
            if not self.stories:
                # Retry once; without this an empty queue (e.g. a failed
                # download) would leave the loop spinning forever.
                self.loadPage()
                if not self.stories:
                    print(u"no stories available, exiting")
                    self.enable = False
                    break
            pageStories = self.stories.pop(0)
            nowPage += 1
            self.getOneStory(pageStories, nowPage)


if __name__ == "__main__":
    spider = QSBK()
    spider.start()
Run Result:
For a different web page, right-click and choose "Inspect Element" to view the page's source, then modify the regular expression to match it.