I had only just finished testing my Qiushibaike ("Qiubai", the "embarrassing stories" site) crawler when, the very next day, Qiubai changed the format of its page source = =
The regular expression has been updated to match the new format. HTML entities that appear in the content are still not decoded, but this does not affect normal use.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download jokes from the Qiushibaike ("Qiubai") hot list and show them in turn.

FileName: spider_qb.py
Modifier: sunny
Since:    2015-04-20

Data layout: ``QiubaiSpider.pages`` is a list of pages; every page is a list
of ``[content, time]`` pairs as returned by ``QiubaiSpider.GetPage``.

Written for Python 2.7; the compatibility shims below also let it run on
Python 3 without changing its behaviour on 2.7.
"""

import re
import time
from contextlib import closing

try:  # Python 2
    import thread
    import urllib2
except ImportError:  # Python 3: same API under different module names
    import _thread as thread
    import urllib.request as urllib2

try:
    # Imported by the original script but never used; kept (optionally) as
    # the hook for the "add encoding detection here" placeholder in GetPage.
    import chardet  # noqa: F401
except ImportError:
    chardet = None

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


BANNER = u"""
#=====================================================
# filename:spider_qb.py
# Describe: Download the joke from the embarrassing hundred and play it in turn
# Modifier:sunny
# since:2015-04-20
# Variable Description: Items,self.pages as list, in the form of:
# [['Time', 'satin content', 'time', 'satin content',...]
# ...
# ['Time', 'satin content', 'time', 'satin content',...]]
#=====================================================
---------------------------------------
   Program: Embarrassing hundred reptile
   version: 0.2
   Reference: Why
   language: Python 2.7
   modified: Before the cloud
   operation: Enter read, non-return exit
---------------------------------------
"""


class QiubaiSpider(object):
    """Crawler for the Qiushibaike hot list.

    Attributes:
        page: number of the next page the background loader will download.
        pages: buffer of downloaded pages that have not been shown yet.
        enable: run flag; both the loader thread and the display loop stop
            once it becomes False.
    """

    BASE_URL = "http://m.qiushibaike.com/hot/page/"
    USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"

    # Group 1: text between the opening content <div> and the "<!--" marker.
    # Group 2: the 19 characters right after "<!--" (presumably a
    #          "YYYY-MM-DD HH:MM:SS" timestamp - confirm against live markup).
    # Group 3: whatever remains up to the closing </div> (unused).
    # re.DOTALL lets "." match newlines as well.
    _CONTENT_RE = re.compile(
        r'<div.*?class="content">(.*?)<!--([\s\S]{19})(.*?)</div>',
        re.DOTALL,
    )

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    @staticmethod
    def _extract_items(html):
        """Return the ``[content, time]`` pairs found in decoded page *html*."""
        # NOTE(review): "<br/>" is stripped from the time field, not from the
        # content, exactly as the original script did - confirm the intent.
        return [
            [match[0].replace("\n", ""), match[1].replace("<br/>", "")]
            for match in QiubaiSpider._CONTENT_RE.findall(html)
        ]

    def GetPage(self, page):
        """Download hot-list page number *page* and return its items.

        Returns:
            A list of ``[content, time]`` pairs (empty when nothing matches).

        Raises:
            Whatever ``urlopen`` raises on network failure, or
            ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        request = urllib2.Request(
            self.BASE_URL + str(page),
            headers={"User-Agent": self.USER_AGENT},
        )
        # closing() guarantees the HTTP response is released on 2.7 and 3.
        with closing(urllib2.urlopen(request)) as response:
            raw_page = response.read()
        # Placeholder from the original: encoding detection could go here.
        return self._extract_items(raw_page.decode("utf-8"))

    def LoadPage(self):
        """Keep up to two pages buffered until ``enable`` is cleared.

        Runs in the background thread started by ``Start``. If a download
        fails, the error is reported and ``enable`` is cleared so the display
        loop stops instead of waiting forever for a page that never arrives.
        """
        while self.enable:
            if len(self.pages) < 2:
                try:
                    fetched = self.GetPage(self.page)
                except Exception:  # thread boundary: report, then stop
                    print("can't connect to embarrass hundred!")
                    self.enable = False
                    break
                self.page += 1
                self.pages.append(fetched)
            else:
                # Buffer is full enough; check again in a second.
                time.sleep(1)

    def ShowPage(self, nowpage, page):
        """Print every item of *nowpage*, waiting for Enter after each one.

        Args:
            nowpage: list of ``[content, time]`` pairs to display.
            page: page number shown in front of each item.

        Any input other than a bare Enter clears ``enable`` and stops.
        """
        for items in nowpage:
            print(u"page%d %s" % (page, items[1]))
            print(items[0])
            if _read_line() != "":
                self.enable = False
                break

    def Start(self):
        """Start the background loader and display pages until the user quits."""
        self.enable = True
        # Read the page number before the loader thread starts changing it.
        page = self.page
        thread.start_new_thread(self.LoadPage, ())
        while self.enable:
            if self.pages:
                # Take the oldest buffered page and drop it from the buffer.
                current = self.pages.pop(0)
                self.ShowPage(current, page)
                page += 1
            else:
                # Nothing buffered yet: yield instead of spinning at 100% CPU.
                time.sleep(0.1)


# The mangled original spells the class name in lower case at its definition;
# keep that spelling available as well.
qiubaispider = QiubaiSpider


def main():
    """Program entry: print the banner, wait for Enter, then run the crawler."""
    print(BANNER)
    print(u"Enter to browse embarrassing hot spots (any key exit):")
    _read_line(" ")
    QiubaiSpider().Start()


if __name__ == "__main__":
    main()
[Python] Qiushibaike ("Qiubai") hot-list crawler v2.0 (updated 15/4/21)