# -*- coding: utf-8 -*-
"""Save the posts of one author from a bbs.tianya.cn thread to a text file.

Starting from a thread page URL, the script follows the "next page" link
until there is none, and appends to the output file the text of the opening
post plus every reply whose author link matches the requested name.
"""
import codecs
import traceback  # noqa: F401  (kept: imported by the original script)

try:
    import urllib2  # Python 2
except ImportError:  # Python 3 exposes the same urlopen() elsewhere
    import urllib.request as urllib2

# Prefix for the site-relative "next page" links.
_BASE_URL = "http://bbs.tianya.cn"

# Sentinel returned by getnextpage() when there is no further page.
_OVER = ' Over'


def opensoup(url, code):
    """Download *url* and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        code: Character encoding handed to the parser (``fromEncoding``).

    Returns:
        The parsed ``BeautifulSoup`` document.
    """
    # Imported here so the pure text helpers of this module remain usable
    # when the scraping dependency is not installed.
    from BeautifulSoup import BeautifulSoup

    page = urllib2.urlopen(url)
    try:
        return BeautifulSoup(page, fromEncoding=code)
    finally:
        # The response is not needed once the parser has been built.
        page.close()


def getcontentfromdiv(contents, min_length=50):
    """Join the text children of a tag into one output paragraph.

    Args:
        contents: Iterable of child nodes; anything that cannot be
            concatenated to a string (for example nested tags) is skipped.
        min_length: Paragraphs whose stripped text is shorter than this are
            dropped. Defaults to 50.

    Returns:
        ``""`` when the stripped text is shorter than *min_length*, otherwise
        the text prefixed with one space and followed by two CRLF pairs.
    """
    text = ""
    for content in contents:
        try:
            text += content
        except (TypeError, UnicodeError):
            # Not a text node (or not decodable): ignore it, best effort.
            continue
    text = text.strip()
    if len(text) < min_length:
        return ""
    return " " + text + "\r\n" + "\r\n"


def readhtml(soup, fp, authname):
    """Write the opening post and *authname*'s replies of one page to *fp*.

    Args:
        soup: Parsed page offering ``find`` and ``findAll``.
        fp: Writable text file object.
        authname: Text that the first child of a reply's ``js-vip-check``
            link must equal for the reply to be kept.
    """
    parts = []

    # The opening post carries both classes; it is kept regardless of author.
    first_post = soup.find(name='div',
                           attrs={'class': 'bbs-content clearfix'})
    if first_post is not None:
        parts.append(getcontentfromdiv(first_post.contents))

    for item in soup.findAll(name='div', attrs={'class': 'atl-item'}):
        user = item.find(name='a', attrs={'class': 'js-vip-check'})
        if user is None or not user.contents or user.contents[0] != authname:
            continue
        body = item.find(name='div', attrs={'class': 'bbs-content'})
        if body is None:
            # Reply without a content block: nothing to save.
            continue
        parts.append(getcontentfromdiv(body.contents))

    fp.write("".join(parts))


def getnextpage(soup, pno):
    """Return the absolute URL of the next page, or ``' Over'`` if none.

    Args:
        soup: Parsed page offering ``find``.
        pno: Number of the page being looked for; currently unused and kept
            only for call compatibility.
    """
    nextlink = soup.find(name="a", attrs={"class": "js-keyboard-next"})
    if nextlink is None:
        return _OVER
    return _BASE_URL + nextlink["href"]


def gethtml(url, filename, authname):
    """Walk a thread from *url* and save *authname*'s posts to *filename*.

    Args:
        url: URL of the first page to read.
        filename: Output path; the file is overwritten and written as UTF-8.
        authname: Author name to keep (see ``readhtml``).
    """
    p = 1
    # The context manager closes the file even when a download fails.
    with codecs.open(filename, 'w', 'utf-8') as fp:
        while True:
            soup = opensoup(url, 'utf-8')
            readhtml(soup, fp, authname)
            url = getnextpage(soup, p + 1)
            if url == _OVER:
                break
            print('PAGE' + str(p) + 'OK')
            p = p + 1
    print('it\'s over')


# Backward-compatible aliases for the capitalised spellings of these names.
Opensoup = opensoup
Getcontentfromdiv = getcontentfromdiv


if __name__ == '__main__':
    # NOTE(review): the author name below looks machine-translated; it must
    # equal the username exactly as shown on the site - confirm before use.
    # To resume from a later page, pass that page's URL instead, e.g.
    # http://bbs.tianya.cn/post-no05-143258-1036.shtml
    gethtml('http://bbs.tianya.cn/post-no05-143258-1.shtml',
            'Krzc.txt',
            u'Guan River 50 states')