Adapted from an online tutorial: a crawler that scrapes jokes from Qiushibaike (the "Embarrassing Encyclopedia"), skipping posts that contain pictures. Details are in the source below:
# -*- coding: utf-8 -*-
#!/usr/bin/python
import re
import sys
import time

try:  # Python 2 module names
    import thread
    import urllib
    import urllib2
except ImportError:  # Python 3: same APIs under their new locations
    import _thread as thread
    import urllib
    import urllib.request as urllib2
# Define the page to crawl
#url = 'http://www.qiushibaike.com/hot/'
# Read the page to crawl
#globalcontent = urllib.urlopen(url).read()
# Grab the joke content
#new_inner01_h = globalcontent.find('<div class="content"><span>')
#print new_inner01_h
# Python 2 only: make UTF-8 the process-wide default codec used for implicit
# str <-> unicode conversions. reload(sys) is required because the site module
# removes sys.setdefaultencoding during interpreter start-up. Python 3 already
# defaults to UTF-8 and provides neither name, so the step is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")
# Load and process Qiushibaike pages
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


class Spider_model(object):
    """Interactive console reader for text-only jokes from Qiushibaike.

    Pages of the "hot" listing are downloaded one at a time, parsed with a
    regular expression, and buffered in ``self.pages``. ``Start`` then prints
    one joke per ENTER key press until the user types ``Q``.
    """

    BASE_URL = "http://m.qiushibaike.com/hot/page/"
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
    )
    # Groups: 1 = publisher, 2 = joke text, 3 = markup between the joke and
    # the stats block (searched for "img" to detect picture posts),
    # 4 = number of likes.
    _ITEM_PATTERN = re.compile(
        r'<div.*?class="author.*?clearfix".*?>.*?<a.*?</a>.*?<a.*?title="(.*?)">'
        r'.*?</a>.*?<div.*?class="content".*?>.*?<span.*?>(.*?)</span>.*?</a>'
        r'(.*?)<div.*?class="stats.*?>.*?<i.*?class="number">(.*?)</i>',
        re.S,  # let "." match newlines, the markup spans several lines
    )
    _BR_TAG = re.compile(r"<br.*?>")
    # Stop prefetching once this many unread pages are buffered.
    _MAX_BUFFERED_PAGES = 2

    def __init__(self):
        # Index of the next listing page to download.
        self.page = 1
        # Buffer of parsed pages not shown yet; each entry is a list of
        # [publisher, content, likes] lists.
        self.pages = []
        # True while the interactive loop should keep running.
        self.enable = False

    def GetPage(self, page):
        """Download listing page number *page*.

        Returns the page decoded from UTF-8, or None when the request fails
        with ``urllib2.URLError`` (the error code and/or reason is printed).
        """
        url = self.BASE_URL + str(page)
        request = urllib2.Request(url, headers={"User-Agent": self.USER_AGENT})
        try:
            response = urllib2.urlopen(request)
            try:
                # decode() turns the raw bytes into a unicode string.
                return response.read().decode("utf-8")
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(u"link embarrassing encyclopedia failure, cause of error %s" % (e.code,))
            if hasattr(e, "reason"):
                print(u"link embarrassing encyclopedia failure, cause of error %s" % (e.reason,))
            return None

    @classmethod
    def _parse_items(cls, html):
        """Extract the picture-free jokes from the HTML text of one page.

        Returns a list of ``[publisher, content, likes]`` lists with
        surrounding whitespace stripped and ``<br>`` tags in the content
        replaced by newlines.
        """
        stories = []
        for author, content, extra, likes in cls._ITEM_PATTERN.findall(html):
            if re.search("img", extra):
                continue  # picture post: skip it
            # Substitute on the content string itself; the 4-tuple returned
            # by findall cannot be passed to re.sub.
            content = cls._BR_TAG.sub("\n", content)
            stories.append([author.strip(), content.strip(), likes.strip()])
        return stories

    def Getpageitems(self, page):
        """Return the picture-free jokes of listing page *page*.

        Returns a (possibly empty) list of ``[publisher, content, likes]``
        lists, or None when the page could not be downloaded.
        """
        html = self.GetPage(page)
        if not html:
            print("page failed to load ...")
            return None
        return self._parse_items(html)

    def LoadPage(self):
        """Fetch the next page into ``self.pages`` if the buffer is low.

        Does nothing while the reader is disabled or when
        ``_MAX_BUFFERED_PAGES`` pages are already waiting.
        """
        if not self.enable or len(self.pages) >= self._MAX_BUFFERED_PAGES:
            return
        stories = self.Getpageitems(self.page)
        if stories is None:
            return  # download failed: keep the index so the page is retried
        # Advance even when the page yielded no jokes, otherwise a page made
        # up only of picture posts would be requested again and again.
        self.page += 1
        if stories:
            self.pages.append(stories)

    def showpage(self, pagestroies, page):
        """Print the jokes of one page, one per ENTER key press.

        *pagestroies* is a list of ``[publisher, content, likes]`` lists and
        *page* the page number shown to the user. Typing ``Q`` (or closing
        standard input) clears ``self.enable`` and returns immediately.
        """
        for story in pagestroies:
            try:
                answer = _read_line()
            except EOFError:
                answer = "Q"  # stdin closed: nothing more can be read
            if answer == "Q":
                self.enable = False
                return
            # Top up the buffer on every key press so the next page is ready.
            self.LoadPage()
            print(u"page %d\npublisher: %s\ncontent: %s\nlikes: %s\n"
                  % (page, story[0], story[1], story[2]))

    def Start(self):
        """Run the interactive reader until the user quits.

        Also stops, with a message, when no further page can be loaded.
        """
        print(u"is reading embarrassing encyclopedia, press ENTER see new jokes, Q quit")
        self.enable = True
        # Load a first page before entering the loop.
        self.LoadPage()
        nowpage = 0  # number of the page currently being read
        while self.enable:
            if not self.pages:
                # Buffer ran dry: try once more, then stop instead of
                # spinning forever without any input or output.
                self.LoadPage()
                if not self.pages:
                    print(u"no more jokes could be loaded, exiting")
                    self.enable = False
                    break
            stories = self.pages.pop(0)
            nowpage += 1
            self.showpage(stories, nowpage)
# Start the interactive reader only when the file is executed as a script,
# so importing the module does not trigger network access or block on input.
if __name__ == "__main__":
    my_model = Spider_model()
    my_model.Start()
Python crawler: scraping joke content from Qiushibaike (the "Embarrassing Encyclopedia")