I have just started learning Python. On the internet I found a crawler for the "best of" (hot) section of Qiushibaike, the "Embarrassing Things Encyclopedia" joke site, and modified it slightly so that it can once again fetch the site's highlights normally. Since it is someone else's code I don't want to keep it to myself, so I am posting it here to share with everyone. Enough talk, here is the code:
# -*- coding: utf-8 -*-
"""Console reader for the "hot" jokes of the Embarrassing Encyclopedia site.

A background thread downloads joke pages and buffers them while the main
thread prints one joke per ENTER key press.  Typing ``quit`` stops reading.

Written for Python 2.7; the small import shims below also let it run on
Python 3.
"""

import contextlib
import re
import threading
import time

try:  # Python 2
    from urllib2 import Request, urlopen
except ImportError:  # Python 3
    from urllib.request import Request, urlopen

try:  # Python 2
    read_line = raw_input
except NameError:  # Python 3
    read_line = input


BANNER = u"""
---------------------------------------
   Program: Embarrassing reptile
   version:0.3
   original why
   modified by: Tian Q-China
   Date:2015-05-12
   language:python 2.7
   action: input quit quit reading embarrassing encyclopedia
   function: Press ENTER to browse today's embarrassing hot spots
---------------------------------------
"""


# ----------- Load handling embarrassing encyclopedia -----------
class Spider_Model(object):
    """Download, buffer and display pages of jokes.

    Attributes:
        page: number of the next page the loader thread will download.
        pages: buffer of downloaded pages; each entry is a list of jokes.
        enable: True while the reader is running; set to False to stop
            both the display loop and the loader thread.
    """

    BASE_URL = "http://www.qiushibaike.com/hot/page/"
    QUERY = "?s=4771468"
    USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; rv:37.0) "
                  "Gecko/20100101 Firefox/37.0")
    # re.S lets "." match line breaks, so a joke may span several lines.
    ITEM_PATTERN = re.compile(r'<div.*?class="content">(.*?)</div>', re.S)
    TIMEOUT = 10            # seconds before a stalled download is abandoned
    MIN_BUFFERED_PAGES = 2  # the loader keeps at least this many pages ready
    RETRY_DELAY = 1         # seconds to wait between loader polls / retries
    POLL_DELAY = 0.1        # seconds the display loop waits for a page

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    @staticmethod
    def _build_url(page):
        """Return the URL of hot-list page *page* (int or str)."""
        # The query string needs its "?" separator; without it the page
        # number and the parameter run together ("/page/1s=4771468").
        return Spider_Model.BASE_URL + str(page) + Spider_Model.QUERY

    @staticmethod
    def _extract_items(html):
        """Return the text of every class="content" div found in *html*.

        Line breaks and "<br/>" tags are removed from each joke.
        """
        return [
            raw.replace("\n", "").replace("<br/>", "")
            for raw in Spider_Model.ITEM_PATTERN.findall(html)
        ]

    def GetPage(self, page):
        """Download page *page* and return its jokes as a list of strings.

        Args:
            page: page number, as a string or an int.

        Returns:
            The cleaned-up joke texts, in page order.

        Raises:
            Whatever ``urlopen`` raises on a network failure, or
            ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        request = Request(
            self._build_url(page),
            headers={"User-Agent": self.USER_AGENT},
        )
        # closing() guarantees the connection is released even if the read
        # fails (Python 2 responses are not context managers themselves).
        with contextlib.closing(urlopen(request, timeout=self.TIMEOUT)) as resp:
            raw_page = resp.read()
        # decode() turns the downloaded bytes into a unicode string.
        return self._extract_items(raw_page.decode("utf-8"))

    def LoadPage(self):
        """Keep the page buffer filled until ``self.enable`` becomes False.

        Intended to run in a background thread.
        """
        while self.enable:
            if len(self.pages) >= self.MIN_BUFFERED_PAGES:
                time.sleep(self.RETRY_DELAY)
                continue
            try:
                jokes = self.GetPage(str(self.page))
            except Exception:
                # Boundary of the worker thread: report and retry, but do
                # not swallow KeyboardInterrupt/SystemExit as a bare
                # "except:" would, and do not hammer the site in a tight
                # loop while it is unreachable.
                print(u"can't link embarrassing encyclopedia!")
                time.sleep(self.RETRY_DELAY)
            else:
                self.page += 1
                self.pages.append(jokes)

    def ShowPage(self, nowPage, page):
        """Print the jokes of one page, waiting for ENTER after each.

        Args:
            nowPage: list of joke strings to display.
            page: page number shown in front of every joke.
        """
        for item in nowPage:
            print(u"page%d %s" % (page, item))
            # Accept the quit command regardless of letter case.
            if read_line().strip().lower() == "quit":
                self.enable = False
                break

    def Start(self):
        """Start the background loader and run the interactive display loop."""
        self.enable = True
        page = self.page

        print(u"Loading Please wait ...")

        # Load the jokes in the background; a daemon thread never keeps
        # the program alive after the reader has quit.
        loader = threading.Thread(target=self.LoadPage)
        loader.daemon = True
        loader.start()

        while self.enable:
            if self.pages:
                nowPage = self.pages.pop(0)
                self.ShowPage(nowPage, page)
                page += 1
            else:
                # Nothing buffered yet: yield the CPU instead of spinning.
                time.sleep(self.POLL_DELAY)


# ----------- Program entrance -----------
def main():
    """Show the banner, wait for ENTER, then start reading."""
    print(BANNER)
    print(u"Please press ENTER to view today's embarrassing content:")
    read_line(" ")
    myModel = Spider_Model()
    myModel.Start()


if __name__ == "__main__":
    main()
The original author's comments in the code are already clear, so I won't go on at length. That is all there is to it.
2015-05-12 Python crawler Learning