Python crawler crawling embarrassing encyclopedia content

Last Update:2016-11-19 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more ＞

This script, adapted from an online tutorial, crawls jokes from Qiushibaike (the "Embarrassing Encyclopedia") and skips entries that contain pictures. The full source is shown below:

# coding=utf-8
#!/usr/bin/python
import re
import sys
import time
import urllib

try:  # Python 2 module names
    import thread
    import urllib2
except ImportError:  # Python 3 equivalents exposing the same names
    import _thread as thread
    import urllib.request as urllib2

# Define the page to crawl
# url = 'http://www.qiushibaike.com/hot/'
# Read the page to crawl
# globalcontent = urllib.urlopen(url).read()
# Grab the joke content
# new_inner01_h = globalcontent.find('<div class="content"<span>')
# print new_inner01_h
# Set the process-wide default string encoding to UTF-8.
# Only Python 2 needs (and supports) this: site.py removes
# sys.setdefaultencoding at startup, so the module has to be reloaded to get
# it back. Python 3 has neither a builtin reload() nor this function.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Load and process Qiushibaike pages
class Spider_model:
    """Interactive console reader for the Qiushibaike "hot" joke pages.

    Pages are downloaded from the mobile site, reduced to the jokes that
    carry no picture, and printed one joke per key press.

    Attributes:
        page: Number of the next site page to download (starts at 1).
        pages: Buffer of parsed pages that have not been shown yet; each
            entry is a list of ``[author, content, likes]`` lists.
        enable: True while the reader loop should keep running.
    """

    # Capture groups, in order: author name, joke text, the markup between
    # the joke and its stats block (searched for "img" to detect picture
    # posts), and the like counter. re.S lets "." span line breaks.
    _ITEM_PATTERN = re.compile(
        '<div.*?class="author.*?clearfix".*?>.*?<a.*?</a>.*?<a.*?title="(.*?)">.*?</a>'
        '.*?<div.*?class="content".*?>.*?<span.*?>(.*?)</span>.*?</a>(.*?)'
        '<div.*?class="stats.*?>.*?<i.*?class="number">(.*?)</i>',
        re.S)

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def GetPage(self, page):
        """Download one "hot" page and return its HTML as unicode text.

        Args:
            page: Page number appended to the listing URL.

        Returns:
            The decoded page source, or None when the request fails with
            ``urllib2.URLError`` (the error code/reason is printed).
        """
        url = "http://m.qiushibaike.com/hot/page/" + str(page)
        user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
        # The header name must not contain spaces, or it is not a valid
        # HTTP header field.
        request = urllib2.Request(url, headers={'User-Agent': user_agent})
        try:
            response = urllib2.urlopen(request)
            try:
                # decode() turns the UTF-8 bytes into a unicode string.
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(u"link embarrassing encyclopedia failure, cause of error %s" % e.code)
            if hasattr(e, "reason"):
                print(u"link embarrassing encyclopedia failure, cause of error %s" % e.reason)
            return None

    def _parse_items(self, html):
        """Extract the picture-free jokes from one page of HTML.

        Args:
            html: Page source as text.

        Returns:
            A list of ``[author, content, likes]`` lists with surrounding
            whitespace stripped and ``<br>`` tags in the content replaced
            by newlines. Entries whose trailing markup mentions "img" are
            skipped.
        """
        stories = []
        for author, content, extra, likes in self._ITEM_PATTERN.findall(html):
            if re.search("img", extra):
                continue
            # Substitute on the content string itself; the matched item is a
            # tuple, which re.sub cannot process.
            content = re.sub(r'<br.*?>', '\n', content)
            stories.append([author.strip(), content.strip(), likes.strip()])
        return stories

    def Getpageitems(self, page):
        """Fetch page number *page* and return its picture-free jokes.

        Returns:
            A list of ``[author, content, likes]`` lists, or None when the
            page could not be downloaded.
        """
        mypage = self.GetPage(page)
        if not mypage:
            print("page failed to load ...")
            return None
        return self._parse_items(mypage)

    def LoadPage(self):
        """Buffer one more page when fewer than two unread pages are held.

        Does nothing while ``enable`` is false. ``page`` only advances when
        the fetched page produced at least one joke.
        """
        if self.enable and len(self.pages) < 2:
            pagestroies = self.Getpageitems(self.page)
            if pagestroies:
                self.pages.append(pagestroies)
                # Move on so the next call reads the following page.
                self.page += 1

    @staticmethod
    def _read_line():
        """Read one line from stdin on both Python 2 and Python 3."""
        try:
            reader = raw_input  # Python 2
        except NameError:
            reader = input  # Python 3
        return reader()

    def showpage(self, pagestroies, page):
        """Print the jokes of one page, one per line of user input.

        Args:
            pagestroies: List of ``[author, content, likes]`` lists.
            page: Page counter shown in the output.

        Typing "Q" clears ``enable`` and returns immediately.
        """
        for items in pagestroies:
            command = self._read_line()
            # Use every key press as a chance to top up the page buffer.
            self.LoadPage()
            if command == "Q":
                self.enable = False
                return
            print(u"page%d \n publisher:%s\t\n published:%s\n likes:%s\n " % (page, items[0], items[1], items[2]))

    def Start(self):
        """Run the interactive loop until the user quits or pages run out."""
        print(u"is reading embarrassing encyclopedia, press ENTER see new jokes, Q quit")
        self.enable = True
        # Load the first page up front.
        self.LoadPage()
        nowpage = 0
        while self.enable:
            if not self.pages:
                # Nothing buffered means the last loads failed; stop instead
                # of spinning forever on an empty buffer.
                print(u"no more jokes could be loaded, exiting")
                self.enable = False
                break
            # Take the oldest buffered page and show it.
            pagestroies = self.pages.pop(0)
            nowpage += 1
            self.showpage(pagestroies, nowpage)
        # NOTE: an earlier variant, left disabled in the original, loaded
        # pages in the background via thread.start_new_thread(self.LoadPage, ()).

    # The original call sites spell these names with different
    # capitalisation; keep both spellings callable.
    getpageitems = Getpageitems
    ShowPage = showpage
    start = Start
# Run the interactive reader only when executed as a script, so importing
# this module does not start crawling or block on console input.
if __name__ == "__main__":
    myModel = Spider_model()
    myModel.Start()

Python crawler crawling embarrassing encyclopedia content

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Python crawler crawling embarrassing encyclopedia content

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Python crawler crawling embarrassing encyclopedia content

Contact Us

What's Trending

Top 10 Tags

Top 10 Keywords

Trending Topic

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support