Code:
# -*- coding: utf-8 -*-
"""Fetch a Baidu Tieba thread page and print its posts as plain text.

``Tool`` strips HTML markup from a post body; ``BDTB`` downloads pages of a
thread and extracts the title, the page count and the post contents with
regular expressions.

The module runs on both Python 2 and Python 3: ``urllib2`` is aliased to
``urllib.request`` when the Python 2 module is unavailable, and every
``print`` call takes a single argument so it behaves the same on both.
"""
import re
import urllib  # kept from the original import list; not used directly here

try:  # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:  # Python 3 split urllib2 into urllib.request / urllib.error
    import urllib.request as urllib2
    from urllib.error import URLError


class Tool(object):
    """Turn a fragment of post HTML into plain text."""

    # <img> tags and runs of exactly seven spaces.
    # NOTE(review): the "<img.*?>" alternative was missing from the pasted
    # source (the pattern started with an empty alternative, which made the
    # whole substitution a no-op); restored from the attribute's name.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing anchor tags (the link text itself is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that are rendered as a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become tab-separated.
    replaceTD = re.compile(r'<td>')
    # Paragraph openers start a new line.
    replacePara = re.compile(r'<p.*?>')
    # Single or doubled <br> collapse to one newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever markup is left after the rules above.
    removeExtraTag = re.compile(r'<.*?>')

    # Applied strictly in this order: the catch-all tag remover must run last,
    # otherwise the structural tags would vanish before being turned into
    # whitespace.
    _SUBSTITUTIONS = (
        (removeImg, ""),
        (removeAddr, ""),
        (replaceLine, "\n"),
        (replaceTD, "\t"),
        (replacePara, "\n"),
        (replaceBR, "\n"),
        (removeExtraTag, ""),
    )

    def replace(self, x):
        """Return ``x`` with HTML tags removed or replaced by whitespace.

        Args:
            x: HTML text of a single post.

        Returns:
            The plain-text version of ``x`` with leading and trailing
            whitespace stripped.
        """
        for pattern, replacement in self._SUBSTITUTIONS:
            x = pattern.sub(replacement, x)
        return x.strip()


class BDTB(object):
    """Reader for a single Baidu Tieba thread."""

    # NOTE(review): the title pattern was empty in the pasted source (which
    # made ``group(1)`` raise IndexError). This is a reconstruction that
    # captures the text of a heading whose class starts with
    # "core_title_txt" -- verify against the live page markup.
    _TITLE_RE = re.compile(
        r'<h[1-6][^>]*class="core_title_txt[^>]*>(.*?)</h[1-6]>', re.S)
    # Text of the second <span> inside the "l_reply_num" list item.
    _PAGE_NUM_RE = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    # Inner HTML of every element whose id starts with "post_content_".
    _CONTENT_RE = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseurl, seelz):
        """Remember the thread URL and the ``see_lz`` query flag.

        Args:
            baseurl: URL of the thread without a query string.
            seelz: value sent as the ``see_lz`` query parameter; converted
                with ``str``.
        """
        self.baseurl = baseurl
        self.seelz = '?see_lz=' + str(seelz)
        self.tool = Tool()

    def _buildUrl(self, pagenum):
        """Return the full URL of page ``pagenum`` of the thread."""
        return self.baseurl + self.seelz + '&pn=' + str(pagenum)

    def getPage(self, pagenum):
        """Download one page of the thread.

        Args:
            pagenum: page number, sent as the ``pn`` query parameter.

        Returns:
            The page body decoded as UTF-8, or ``None`` when the request
            fails with ``URLError`` (the reason is printed if available).
        """
        try:
            response = urllib2.urlopen(urllib2.Request(self._buildUrl(pagenum)))
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read/decode fails.
                response.close()
        except URLError as e:
            if hasattr(e, "reason"):
                print(u"%s %s" % (
                    u"link Baidu Paste failed, the cause of the error",
                    e.reason))
            return None

    def _searchFirstPage(self, pattern):
        """Return the stripped first group of ``pattern`` on page 1, or None.

        ``None`` is returned both when the page could not be downloaded and
        when the pattern does not match.
        """
        page = self.getPage(1)
        if page is None:
            return None
        match = pattern.search(page)
        if match is None:
            return None
        return match.group(1).strip()

    def getTitle(self):
        """Return the thread title taken from page 1, or ``None``."""
        return self._searchFirstPage(self._TITLE_RE)

    def getPageNum(self):
        """Return the page count shown on page 1 as a string, or ``None``."""
        return self._searchFirstPage(self._PAGE_NUM_RE)

    def getContent(self, page):
        """Print every post found in ``page`` as plain text.

        Each post is preceded by a separator line carrying its 1-based
        position on the page. Nothing is printed when ``page`` is ``None``
        or empty (e.g. after a failed download).

        Args:
            page: HTML of a thread page as returned by ``getPage``.
        """
        if not page:
            return
        for floor, item in enumerate(self._CONTENT_RE.findall(page), 1):
            print(u"%d %s" % (
                floor, u"Lou-----------------------------------------\n"))
            print(self.tool.replace(item))


def main():
    """Fetch page 1 of the sample thread and print its posts."""
    baseurl = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseurl, 1)
    page = bdtb.getPage(1)
    # The original script calls these and discards the results; kept as is.
    bdtb.getTitle()
    bdtb.getPageNum()
    bdtb.getContent(page)


# Guarded so that importing the module does not trigger network requests.
if __name__ == "__main__":
    main()
Python Crawler in Practice (II): Crawling Baidu Tieba