Following an online tutorial, this script crawls the content of a forum post and saves it to a local text file:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys
import urllib.error
import urllib.request

# NOTE(review): the original Python 2 imports (urllib, urllib2) were replaced
# by urllib.request / urllib.error for Python 3, and the old
# reload(sys); sys.setdefaultencoding("utf-8") hack was dropped — Python 3
# strings are Unicode by default, so it is unnecessary.
# Utility for cleaning up page markup: removes images, hyperlinks,
# line breaks, and other tags from a fragment of post HTML.
class Tool:
    # Strip <img> tags and 7-space runs (layout padding in the page source)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Strip hyperlink tags (the anchor text itself is kept)
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Replace line-breaking tags with \n
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Replace table cells <td> with \t
    replaceTD = re.compile(r'<td>')
    # Replace a paragraph opening tag with \n plus two spaces
    replacePara = re.compile(r'<p.*?>')
    # Replace single or double <br> with \n
    replaceBR = re.compile(r'<br><br>|<br>')
    # Remove any remaining tags
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with the markup above stripped out.

        The whitespace-normalizing substitutions were disabled in the
        original tutorial; they are kept here, commented, for reference.
        """
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        # x = re.sub(self.replaceLine, "\n", x)
        # x = re.sub(self.replaceTD, "\t", x)
        # x = re.sub(self.replacePara, "\n  ", x)
        # x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() removes leading/trailing whitespace; Python 3 strings are
        # already Unicode, so the original .encode('utf-8') was dropped
        # (encoding would return bytes, not str, under Python 3).
        return x.strip()
# Practice crawler for Baidu Tieba (Baidu Post Bar).
class BDTB:
    """Downloads every page of a Tieba post and writes it to a local .txt file."""

    def __init__(self, baseUrl, seeLZ, floorTag):
        # Base URL of the post, e.g. http://tieba.baidu.com/p/<id>
        self.baseURL = baseUrl
        # Query flag: 1 = only show the original poster's replies
        self.seeLZ = '?see_lz=' + str(seeLZ)
        # HTML tag-stripping helper
        self.tool = Tool()
        # Output file object; opened later by setFileTitle()
        self.file = None
        # Current floor (reply) number, initialized to 1
        self.floor = 1
        # Fallback title used when the real title cannot be parsed
        self.defaultTitle = u"Baidu Tieba"
        # Whether to write a separator line between floors ('1' = yes)
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the post and return its HTML, or None on error."""
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib.request.Request(url)
            response = urllib.request.urlopen(request)
            return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            # Report why the connection failed, then signal failure to the caller.
            if hasattr(e, "reason"):
                print(u"Failed to connect to Baidu Tieba, reason:", e.reason)
            return None

    def getTitle(self, page):
        """Extract the post title from *page*; return None when not found.

        Bug fix: the original ignored *page* and unconditionally re-fetched
        page 1; now the passed-in HTML is used (page 1 is fetched only as a
        fallback when None is given).
        """
        if page is None:
            page = self.getPage(1)
        # NOTE(review): regex reconstructed from the tutorial — the literal
        # was truncated in this copy of the source; confirm against live HTML.
        pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self, page):
        """Extract the total page count of the post; return None when not found.

        Bug fix: the original printed None instead of returning it on a
        failed match, so callers always received None implicitly.
        """
        if page is None:
            page = self.getPage(1)
        pattern = re.compile(r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        return None

    def getContent(self, page):
        """Return a list of cleaned-up floor (reply) contents from *page*."""
        pattern = re.compile(r'<div id="post_content.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        contents = []
        for item in items:
            # Strip tags and surround each floor with blank lines.
            contents.append("\n" + self.tool.replace(item) + "\n")
        return contents

    def setFileTitle(self, title):
        """Open the output .txt file, falling back to the default title."""
        if title is not None:
            self.file = open(title + ".txt", "w+", encoding="utf-8")
        else:
            self.file = open(self.defaultTitle + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write each floor's content to the file, with optional separators."""
        for item in contents:
            if self.floorTag == '1':
                # Separator line written between floors.
                floorLine = "\n--------------" + str(self.floor) + " floor-----------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Main entry point: fetch page 1, resolve title/page count, crawl all pages."""
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        title = self.getTitle(indexPage)
        # Bug fix: validate the page count BEFORE opening the output file,
        # so an invalid URL no longer leaves an empty file behind.
        if pageNum is None:
            print("URL is invalid, please try again")
            return
        self.setFileTitle(title)
        try:
            print("This post has " + str(pageNum) + " page(s) in total")
            for i in range(1, int(pageNum) + 1):
                print("Writing page " + str(i))
                page = self.getPage(i)
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError as e:
            print("Write exception, reason: " + str(e))
        finally:
            # Bug fix: the original never closed the file handle.
            if self.file is not None:
                self.file.close()
            print("Write task complete")
def main():
    """Prompt for the post id and crawl options, then run the crawler."""
    print(u"Please enter the post number")
    baseURL = 'http://tieba.baidu.com/p/' + str(input(u'http://tieba.baidu.com/p/'))
    seeLZ = input("Only fetch the original poster's replies? yes = 1, no = 0\n")
    floorTag = input("Write floor separators? yes = 1, no = 0\n")
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()


# Guard the entry point so importing this module does not start prompting.
if __name__ == "__main__":
    main()
A Python crawler for Baidu Tieba posts.