Python crawler for scraping Qiushibaike (糗事百科) jokes

Source: Internet
Author: User

Adapted from an online tutorial with some changes, this script crawls jokes from Qiushibaike and skips any post that contains a picture. The full source follows:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import re
import thread
import time
import sys

# The page to crawl
# url = 'http://www.qiushibaike.com/hot/'
# Read the page
# globalContent = urllib.urlopen(url).read()
# Grab the joke content
# new_inner01_h = globalContent.find('<div class="content"><span>')
# print new_inner01_h

# Set the system default encoding to utf-8
reload(sys)
sys.setdefaultencoding("utf-8")

# Spider that loads and processes Qiushibaike
class Spider_Model:
    """docstring for Spider_Model"""
    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    # Fetch the HTML of the given page
    def GetPage(self, page):
        try:
            myUrl = "http://m.qiushibaike.com/hot/page/" + str(page)
            user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
            headers = {'User-Agent': user_agent}
            req = urllib2.Request(myUrl, headers=headers)
            myResponse = urllib2.urlopen(req)
            # decode() converts the fetched bytes into a Unicode string
            myPage = myResponse.read().decode('utf-8')
            return myPage
        except urllib2.URLError, e:
            if hasattr(e, "code"):
                print u"Failed to connect to Qiushibaike, error code:", e.code
            if hasattr(e, "reason"):
                print u"Failed to connect to Qiushibaike, reason:", e.reason
            return None

    # Given a page number, return the list of jokes on that page that contain no pictures
    def GetPageItems(self, page):
        myPage = self.GetPage(page)
        if not myPage:
            print "Page failed to load..."
            return None
        # Regex that captures the author, the joke text, the trailing HTML (used to detect images) and the vote count
        pattern = re.compile('<div.*?class="author.*?clearfix".*?>.*?<a.*?</a>.*?<a.*?title="(.*?)">.*?</a>.*?<div.*?class' +
                             '="content".*?>.*?<span.*?>(.*?)</span>.*?</a>(.*?)<div.*?class="stats.*?>.*?<i.*?class="number">(.*?)</i>', re.S)
        items = re.findall(pattern, myPage)

        # Jokes of the current page
        pageStories = []
        # Walk through every regex match
        for item in items:
            # Check whether the joke contains a picture
            haveImg = re.search("img", item[2])

            # Only keep jokes without pictures
            if not haveImg:
                # item[0] is the publisher, item[1] is the content, item[3] is the number of likes
                if '<br/>' in item[1]:
                    content = re.sub(r'<br.*?>', '\n', item[1])
                    pageStories.append([item[0].strip(), content.strip(), item[3].strip()])
                else:
                    pageStories.append([item[0].strip(), item[1].strip(), item[3].strip()])
        return pageStories

    # Earlier attempt: find every div tag with class="content"
    # re.S lets "." also match newlines
    # myItems = re.findall('<div.*?class="content">(.*?)</div>', unicodePage, re.S)
    # items = []
    # for item in myItems:
    #     # item[0] is the title of the div, item[1] is its content
    #     items.append([item[0].replace("\n", ""), item[1].replace("\n", "")])
    # return items

    # Load and parse a page's content and add it to the list
    def LoadPage(self):
        # Keep working as long as the user has not entered "Q"
        # while self.enable:
        if self.enable == True:
            # If fewer than two unread pages are cached, load a new one
            if len(self.pages) < 2:
                # Fetch one new page
                pageStories = self.GetPageItems(self.page)
                # Store that page's jokes in the shared list
                if pageStories:
                    self.pages.append(pageStories)
                    # Advance the page index so the next load reads the next page
                    self.page += 1
            # try:
            #     # Fetch the jokes of the new page
            #     myPage = self.GetPage(str(self.page))
            #     self.page += 1
            #     self.pages.append(myPage)
            # except:
            #     print 'Cannot connect to Qiushibaike'
            # else:
            #     time.sleep(1)
    # Called after every enter key press to print one joke
    def ShowPage(self, pageStories, page):
        # Walk through the jokes of one page
        for items in pageStories:
            # Wait for user input
            input = raw_input()
            # On every enter, check whether a new page should be loaded
            self.LoadPage()

            # Entering Q ends the program
            if input == "Q":
                self.enable = False
                return
            print u"Page %d\nPublisher: %s\nContent: %s\nLikes: %s\n" % (page, items[0], items[1], items[2])


    def Start(self):
        print u"Reading Qiushibaike: press enter for a new joke, or Q to quit"
        # Initialize to True so the program runs normally
        self.enable = True
        # Load one page up front
        self.LoadPage()
        # Local variable tracking which page is currently being read
        nowPage = 0
        while self.enable:
            if len(self.pages) > 0:
                # Take one page of jokes from the shared list
                pageStories = self.pages[0]
                # Advance the counter of the page currently being read
                nowPage += 1
                # Remove the first element from the shared list, since it has been taken out
                del self.pages[0]
                # Print the jokes of that page
                self.ShowPage(pageStories, nowPage)
        '''
        page = self.page

        # Start a background thread that keeps loading and storing jokes
        thread.start_new_thread(self.LoadPage, ())

        # Process Qiushibaike pages
        while self.enable:
            # If the pages list holds any elements
            if self.pages:
                nowPage = self.pages[0]
                del self.pages[0]
                self.ShowPage(nowPage, page)
                page += 1

        print u"Press enter to view today's Qiushibaike:"
        raw_input('')
        '''


myModel = Spider_Model()
myModel.Start()
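
The listing above targets Python 2 (urllib2, raw_input, print statements, reload(sys)). As a rough point of comparison, here is a minimal Python 3 sketch of just the page-fetch step; the get_page name is made up for illustration, and whether m.qiushibaike.com still serves this markup is an assumption. The regex parsing and paging logic from the class would sit on top of it unchanged.

# Minimal Python 3 sketch of the fetch step (assumes the mobile hot-page URL is still reachable)
import urllib.request
import urllib.error

def get_page(page):
    url = "http://m.qiushibaike.com/hot/page/" + str(page)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            # Bytes from the socket are decoded into a str before regex matching
            return resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        print("Failed to connect to Qiushibaike:", getattr(e, 'reason', e))
        return None

In Python 3 the reload(sys)/setdefaultencoding trick is unnecessary, since the decoded page is already a Unicode str.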
