Following an online tutorial, this script crawls the content of a forum post and saves it to a local text file:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys
import urllib.error
import urllib.request

# NOTE(review): the original Python 2 imports (urllib, urllib2) were replaced
# by urllib.request / urllib.error for Python 3, and the old
# reload(sys); sys.setdefaultencoding("utf-8") hack was dropped — Python 3
# strings are Unicode by default, so it is unnecessary.
# Utility for cleaning up page markup: removes images, hyperlinks,
# line breaks, and other tags from a fragment of post HTML.
class Tool:
    # Strip <img> tags and 7-space runs (layout padding in the page source)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Strip hyperlink tags (the anchor text itself is kept)
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Replace line-breaking tags with \n
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Replace table cells <td> with \t
    replaceTD = re.compile(r'<td>')
    # Replace a paragraph opening tag with \n plus two spaces
    replacePara = re.compile(r'<p.*?>')
    # Replace single or double <br> with \n
    replaceBR = re.compile(r'<br><br>|<br>')
    # Remove any remaining tags
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with the markup above stripped out.

        The whitespace-normalizing substitutions were disabled in the
        original tutorial; they are kept here, commented, for reference.
        """
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        # x = re.sub(self.replaceLine, "\n", x)
        # x = re.sub(self.replaceTD, "\t", x)
        # x = re.sub(self.replacePara, "\n  ", x)
        # x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() removes leading/trailing whitespace; Python 3 strings are
        # already Unicode, so the original .encode('utf-8') was dropped
        # (encoding would return bytes, not str, under Python 3).
        return x.strip()
# Practice crawler for Baidu Tieba (Baidu Post Bar).
class BDTB:
    """Downloads every page of a Tieba post and writes it to a local .txt file."""

    def __init__(self, baseUrl, seeLZ, floorTag):
        # Base URL of the post, e.g. http://tieba.baidu.com/p/<id>
        self.baseURL = baseUrl
        # Query flag: 1 = only show the original poster's replies
        self.seeLZ = '?see_lz=' + str(seeLZ)
        # HTML tag-stripping helper
        self.tool = Tool()
        # Output file object; opened later by setFileTitle()
        self.file = None
        # Current floor (reply) number, initialized to 1
        self.floor = 1
        # Fallback title used when the real title cannot be parsed
        self.defaultTitle = u"Baidu Tieba"
        # Whether to write a separator line between floors ('1' = yes)
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the post and return its HTML, or None on error."""
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib.request.Request(url)
            response = urllib.request.urlopen(request)
            return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            # Report why the connection failed, then signal failure to the caller.
            if hasattr(e, "reason"):
                print(u"Failed to connect to Baidu Tieba, reason:", e.reason)
            return None

    def getTitle(self, page):
        """Extract the post title from *page*; return None when not found.

        Bug fix: the original ignored *page* and unconditionally re-fetched
        page 1; now the passed-in HTML is used (page 1 is fetched only as a
        fallback when None is given).
        """
        if page is None:
            page = self.getPage(1)
        # NOTE(review): regex reconstructed from the tutorial — the literal
        # was truncated in this copy of the source; confirm against live HTML.
        pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self, page):
        """Extract the total page count of the post; return None when not found.

        Bug fix: the original printed None instead of returning it on a
        failed match, so callers always received None implicitly.
        """
        if page is None:
            page = self.getPage(1)
        pattern = re.compile(r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        return None

    def getContent(self, page):
        """Return a list of cleaned-up floor (reply) contents from *page*."""
        pattern = re.compile(r'<div id="post_content.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        contents = []
        for item in items:
            # Strip tags and surround each floor with blank lines.
            contents.append("\n" + self.tool.replace(item) + "\n")
        return contents

    def setFileTitle(self, title):
        """Open the output .txt file, falling back to the default title."""
        if title is not None:
            self.file = open(title + ".txt", "w+", encoding="utf-8")
        else:
            self.file = open(self.defaultTitle + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write each floor's content to the file, with optional separators."""
        for item in contents:
            if self.floorTag == '1':
                # Separator line written between floors.
                floorLine = "\n--------------" + str(self.floor) + " floor-----------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Main entry point: fetch page 1, resolve title/page count, crawl all pages."""
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        title = self.getTitle(indexPage)
        # Bug fix: validate the page count BEFORE opening the output file,
        # so an invalid URL no longer leaves an empty file behind.
        if pageNum is None:
            print("URL is invalid, please try again")
            return
        self.setFileTitle(title)
        try:
            print("This post has " + str(pageNum) + " page(s) in total")
            for i in range(1, int(pageNum) + 1):
                print("Writing page " + str(i))
                page = self.getPage(i)
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError as e:
            print("Write exception, reason: " + str(e))
        finally:
            # Bug fix: the original never closed the file handle.
            if self.file is not None:
                self.file.close()
            print("Write task complete")
def main():
    """Prompt for the post id and crawl options, then run the crawler."""
    print(u"Please enter the post number")
    baseURL = 'http://tieba.baidu.com/p/' + str(input(u'http://tieba.baidu.com/p/'))
    seeLZ = input("Only fetch the original poster's replies? yes = 1, no = 0\n")
    floorTag = input("Write floor separators? yes = 1, no = 0\n")
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()


# Guard the entry point so importing this module does not start prompting.
if __name__ == "__main__":
    main()
A Python crawler for Baidu Tieba posts.