# Target URL: http://tieba.baidu.com/p/2887364960?see_lz=1&pn=1
"""Save the post text of a Baidu Tieba thread, page by page, to ``007.txt``."""
import http.client
import re
import urllib.error
import urllib.request

# Matches element ids of the form ``post_content_<digits>``.
pattern = re.compile('post_content_[0-9]{1,}')

# Thread URL without the page number; the page index is appended per request.
url = 'http://tieba.baidu.com/p/2887364960?see_lz=1&pn='


def IsRightId(id):
    """Return every ``post_content_<digits>`` substring of *id*, or ``None``.

    Args:
        id: The string to search. The name shadows the builtin ``id`` but is
            kept so existing keyword callers continue to work.

    Returns:
        A non-empty list of the matching substrings, or ``None`` when there
        is no match.
    """
    return pattern.findall(id) or None


def _iter_post_texts(html):
    """Yield the text of each ``<div>`` in *html* whose id matches ``pattern``."""
    # Imported here so the helpers above stay importable when the third-party
    # parser package is not installed.
    import bs4

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for div in soup.find_all('div'):
        # A <div> without an id yields None -> 'None', which never matches.
        if IsRightId(str(div.get('id'))):
            yield div.get_text()


def GetEveryPage(url, file):
    """Fetch one page and append the text of its post divs to *file*.

    Args:
        url: Full URL of the page to download.
        file: Writable text file object; each post is written followed by
            ``'\\r\\n'``.

    A failure to open the URL is reported with ``print`` and the page is
    skipped, so one bad page does not abort the whole run.
    """
    try:
        rep = urllib.request.urlopen(url)
    except (urllib.error.URLError, http.client.HTTPException) as e:
        # urlopen reports connection and HTTP-status failures as URLError
        # (HTTPError is a subclass); HTTPException covers protocol errors.
        print(repr(e))
        return

    # Close the response even if reading or decoding fails.
    with rep:
        html = rep.read().decode('gbk')

    for text in _iter_post_texts(html):
        file.write(text + '\r\n')


def main(first_page=1, last_page=16):
    """Download pages *first_page*..*last_page* of the thread into ``007.txt``."""
    # The context manager closes the output file even if a page raises.
    with open('007.txt', 'w+', encoding='utf-8') as file_tieba:
        for i in range(first_page, last_page + 1):
            print('Processing page:{}/{}......'.format(i, last_page))
            GetEveryPage(url + str(i), file_tieba)
    print('Finished!')


if __name__ == '__main__':
    main()