#encoding: UTF-8
import base64
import codecs
import re
import sys
import time
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from random import sample
# Path of the output file that collects the extracted text.
log = 'Gogogo.txt'
# Bound to `Logfile` because that is the name Myhtmlparser's handlers write to.
# newline='\n' keeps "\n" untranslated on every platform, matching the
# byte-for-byte output of the codecs-based stream this replaces.
Logfile = open(log, 'w', encoding='utf-8', newline='\n')
class Myhtmlparser(HTMLParser):
    """Extract the text of selected elements from an HTML page.

    Text inside ``<a class="tit">`` is written followed by ``|``; text inside
    ``<span class="reply">`` is written followed by a newline. A title link
    followed by its reply span therefore ends up on one ``title|reply`` line.

    Args:
        out: Writable text stream that receives the extracted text. When
            omitted, the module-level ``Logfile`` is used; it is looked up at
            write time, so it only has to exist once parsing starts.

    Attributes:
        a: True while the parser is inside an ``<a class="tit">`` element.
        span: True while the parser is inside a ``<span class="reply">``
            element.
    """

    def __init__(self, out=None):
        super().__init__()
        self.a = False
        self.span = False
        self._out = out

    def _write(self, text):
        """Write *text* to the configured stream, or to ``Logfile``."""
        target = Logfile if self._out is None else self._out
        target.write(text)

    def handle_starttag(self, tag, attrs):
        """Start capturing when a watched element opens."""
        # attrs is a list of (name, value) pairs; the class attribute must
        # equal the watched value exactly.
        if tag == 'a' and ('class', 'tit') in attrs:
            self.a = True
        if tag == 'span' and ('class', 'reply') in attrs:
            self.span = True

    def handle_endtag(self, tag):
        """Stop capturing and emit the field/record separator."""
        if tag == 'a' and self.a:
            self.a = False
            self._write('|')
        if tag == 'span' and self.span:
            self.span = False
            self._write('\n')

    def handle_data(self, data):
        """Write text that appears inside a watched element."""
        if self.a or self.span:
            self._write(data)
# Single shared parser instance; every downloaded page is fed into it.
Parser = Myhtmlparser ()
def getpage(URL):
    """Download *URL* and return the raw response body.

    Args:
        URL: URL string to fetch.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urllib.request.urlopen``).
    """
    req = urllib.request.Request(URL)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()
# Crawl forum listing pages 1..404 and feed each one to the shared parser.
try:
    for i in range(1, 405):
        url = 'http://bbs.qyer.com/forum-52-' + str(i) + '.html'
        # Undecodable bytes are dropped instead of aborting the crawl.
        page = getpage(url).decode('utf-8', 'ignore')
        Parser.feed(page)
        print(i)  # progress indicator: number of the page just processed
finally:
    # Flush and close the log even when a download fails part-way through.
    Logfile.close()
# A small Python program that crawls thread titles together with their author and time.