# Tags:
#encoding:UTF-8
import urllib.parse
import urllib.request
import base64
import re
import sys
import time
from random import sample
import codecs
from html.parser import HTMLParser
# Output file that receives the scraped text, one record per line.
log = 'gogogo.txt'
# Opened once at import time and shared by the parser callbacks; codecs.open
# encodes every written str as UTF-8 and performs no newline translation.
logfile = codecs.open(log, 'w', 'utf-8')
class MyHTMLParser(HTMLParser):
    """Copy the text of selected elements to an output stream.

    Two kinds of elements are tracked:

    * ``<a class="tit">``      -- its text is written, followed by ``|``
    * ``<span class="reply">`` -- its text is written, followed by a newline

    Everything else in the document is ignored.  (Presumably the anchor is
    the thread title and the span holds author/time info -- confirm against
    the target page markup.)

    Args:
        out: Optional object with a ``write(str)`` method that receives the
            extracted text.  When omitted, the module-level ``logfile`` is
            used, which is the original behaviour.
    """

    def __init__(self, out=None):
        super().__init__()
        # 1 while the parser is inside a matching element, otherwise 0.
        self.a = 0
        self.span = 0
        self._out = out

    def _write(self, text):
        """Send *text* to the configured sink (``logfile`` by default)."""
        # The global is looked up lazily so an explicit ``out`` never needs it.
        sink = logfile if self._out is None else self._out
        sink.write(text)

    def handle_starttag(self, tag, attrs):
        """Start capturing when a tracked element opens."""
        # ``attrs`` is a list of (name, value) pairs; the class attribute
        # must match exactly (no multi-class matching).
        if tag == 'a' and ('class', 'tit') in attrs:
            self.a = 1
        if tag == 'span' and ('class', 'reply') in attrs:
            self.span = 1

    def handle_endtag(self, tag):
        """Stop capturing and emit the field/record separator."""
        if tag == 'a' and self.a == 1:
            self.a = 0
            self._write('|')
        if tag == 'span' and self.span == 1:
            self.span = 0
            self._write('\n')

    def handle_data(self, data):
        """Write text that appears while a tracked element is open."""
        if self.a or self.span:
            self._write(data)
# Single module-level parser instance; every downloaded page is fed into it.
parser = MyHTMLParser()
def getpage(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request; anything ``urllib.request.Request`` accepts.

    Returns:
        The undecoded body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urlopen``).
    """
    req = urllib.request.Request(url)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()
# Download forum index pages 1 through 404 in order and run each through the
# shared parser, which writes the extracted text to ``logfile``.
try:
    for i in range(1, 405):
        url = 'http://bbs.qyer.com/forum-52-' + str(i) + '.html'
        # Undecodable bytes are dropped rather than aborting the whole crawl.
        page = getpage(url).decode('utf-8', 'ignore')
        parser.feed(page)
        print(i)  # progress indicator: number of the page just processed
finally:
    # Flush and release the output file even if a download fails midway.
    logfile.close()
# A small Python program that scrapes titles together with author and time info.