Python crawler: crawling a Baidu Tieba post

Source: Internet
Author: User

Following an online tutorial, this script crawls the content of a Baidu Tieba post and saves the crawled text to a local file:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import io
import re
import sys
import urllib
import urllib2

# Python 2 only: re-expose sys.setdefaultencoding (removed from the module by
# site.py at startup) and make UTF-8 the default codec for implicit
# str <-> unicode conversions performed elsewhere in this script.
reload(sys)
sys.setdefaultencoding("utf-8")
# Page-markup cleaner: removes images, hyperlinks and any remaining tags.
class Tool:
    """Strip HTML markup from a fragment of a post page.

    The compiled patterns are class attributes.  Only ``removeimg``,
    ``removeaddr`` and ``removeet`` are applied by :meth:`replace`; the
    line-break/tab patterns are kept as attributes but are not applied.
    """

    # Remove <img> tags and runs of seven spaces.
    # NOTE(review): the tag alternative was stripped from the published
    # listing (only "| {7}|" survived); "<img.*?>" is reconstructed from the
    # accompanying comment ("remove img tags, 7 spaces") -- confirm.
    removeimg = re.compile(r'<img.*?>| {7}')
    # Remove hyperlink opening and closing tags (link text is kept).
    removeaddr = re.compile(r'<a.*?>|</a>')
    # Tags intended to be replaced by "\n".
    replaceline = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cell tag intended to be replaced by "\t".
    replacetd = re.compile(r'<td>')
    # Paragraph opening tag intended to be replaced by "\n" plus two spaces.
    replacepara = re.compile(r'<p.*?>')
    # Single or double <br> intended to be replaced by "\n".
    replacebr = re.compile(r'<br><br>|<br>')
    # Any remaining tag.
    removeet = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with images, links and all other tags removed.

        Args:
            x: text fragment containing HTML markup.

        Returns:
            The cleaned text, stripped of leading/trailing whitespace and
            encoded as UTF-8.
        """
        for pattern in (self.removeimg, self.removeaddr, self.removeet):
            x = pattern.sub("", x)
        # strip() drops the surplus whitespace left at both ends.
        return x.strip().encode('utf-8')
# Baidu Tieba crawler exercise.
class BDTB:
    """Download a Baidu Tieba thread page by page and save its text to a file.

    Written for Python 2 (``urllib2``); page fetching will not work on
    Python 3 without porting.
    """

    # NOTE(review): the title pattern was truncated in the published listing
    # (only "re.compile('" survived).  This reconstruction assumes the title
    # sits in a heading element whose class starts with "core_title_txt" --
    # confirm against a live page.
    _title_re = re.compile(
        r'<h([1-6])[^>]*class="core_title_txt.*?>(.*?)</h\1>', re.S)
    _pagenum_re = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    _content_re = re.compile(r'<div id="post_content.*?>(.*?)</div>', re.S)
    # Characters that must not reach the file system from a scraped title.
    _unsafe_filename_re = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, baseurl, seelz, floortag):
        """Store the crawl settings.

        Args:
            baseurl: URL of the thread, without query string.
            seelz: value for the "only see the original poster" parameter.
            floortag: '1' to write a separator line before each floor.
        """
        # Base link address.
        self.baseurl = baseurl
        # Query-string fragment selecting "original poster only".
        # NOTE(review): the listing spells the parameter "seelz"; verify the
        # name the site actually expects.
        self.seelz = '?seelz=' + str(seelz)
        # HTML tag-stripping helper.
        self.tool = Tool()
        # File object used for writing; opened by setfiletitle().
        self.file = None
        # Floor (post) counter, starting at 1.
        self.floor = 1
        # Title used when the real title cannot be extracted.
        self.defaulttitle = u"Baidu paste"
        # Whether to write the floor separator.
        self.floortag = floortag

    def getpage(self, pagenum):
        """Fetch page *pagenum* of the thread.

        Returns:
            The page decoded as UTF-8, or None when the request fails.
        """
        url = self.baseurl + self.seelz + '&pn=' + str(pagenum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except urllib2.URLError as e:
            # Report why the connection failed, when a reason is available.
            if hasattr(e, "reason"):
                print(u'link Baidu paste failed, error reason: %s' % (e.reason,))
            return None

    def getTitle(self, page):
        """Extract the thread title from *page*.

        Args:
            page: page text; when None, page 1 is fetched.

        Returns:
            The stripped title, or None when it cannot be found.
        """
        if page is None:
            page = self.getpage(1)
        if page is None:
            return None
        result = self._title_re.search(page)
        return result.group(2).strip() if result else None

    def getpagenum(self, page):
        """Extract the total number of pages from *page*.

        Args:
            page: page text; when None, page 1 is fetched.

        Returns:
            The page count as a stripped string, or None when not found.
        """
        if page is None:
            page = self.getpage(1)
        if page is None:
            return None
        result = self._pagenum_re.search(page)
        return result.group(1).strip() if result else None

    def getcontent(self, page):
        """Return the cleaned text of every floor found in *page*.

        Returns:
            A list of UTF-8 byte strings, each wrapped in newlines; empty
            when *page* is None.
        """
        if page is None:
            return []
        contents = []
        for item in self._content_re.findall(page):
            text = self.tool.replace(item)
            if not isinstance(text, bytes):
                text = text.encode('utf-8')
            # Surround each floor with line breaks.
            contents.append(b"\n" + text + b"\n")
        return contents

    def setfiletitle(self, title):
        """Open the output file, named after *title* (or the default title)."""
        name = title if title is not None else self.defaulttitle
        # The title is scraped from a remote page: neutralise path separators
        # and other characters that are invalid in file names.
        name = self._unsafe_filename_re.sub(u"_", name)
        self.file = io.open(name + u".txt", "w+", encoding="utf-8")

    def writedata(self, contents):
        """Write each floor in *contents* to the open file.

        A separator line is written before each floor when ``floortag`` is
        '1'.  The floor counter advances for every item either way.
        """
        for item in contents:
            if self.floortag == '1':
                # Delimiter written between floors.
                floorline = (u"\n--------------" + str(self.floor) +
                             u"floor-----------------\n")
                self.file.write(floorline)
            text = item.decode("utf-8") if isinstance(item, bytes) else item
            self.file.write(text)
            self.floor += 1

    def start(self):
        """Crawl the whole thread and write it to a text file."""
        indexpage = self.getpage(1)
        pagenum = self.getpagenum(indexpage) if indexpage is not None else None
        # Validate before creating the output file so a bad URL leaves no
        # empty file behind.
        if pagenum is None:
            print("URL is invalid, please try again")
            return
        title = self.getTitle(indexpage)
        try:
            self.setfiletitle(title)
            print("This post in total" + str(pagenum) + "page")
            for i in range(1, int(pagenum) + 1):
                print("writing" + str(i) + "page data")
                page = self.getpage(i)
                contents = self.getcontent(page)
                self.writedata(contents)
        except IOError as e:
            print("Write exception, reason" + str(e))
        finally:
            if self.file is not None:
                self.file.close()
                self.file = None
            print("Write task complete")
# Script entry point: ask for the post id and options, then run the crawler.
# Guarded so that importing this module does not block on keyboard input.
if __name__ == "__main__":
    print(u"Please enter the post code")
    # The prompt shows the fixed URL prefix; the user types only the post id.
    baseurl = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/'))
    seelz = raw_input("Whether only to get the landlord to speak, is input 1, no input 0\n")
    floortag = raw_input("Whether to write floor information, is input 1, no input 0\n")
    # Use a lowercase instance name so the BDTB class is not shadowed.
    bdtb = BDTB(baseurl, seelz, floortag)
    bdtb.start()

Python crawler: crawling a Baidu Tieba post

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.