Resources:
http://www.360doc.com/content/17/0620/16/44530822_664927373.shtml
The final HTMLParser module should read:
"" A parser for HTML and XHTML. "" "
# This file was based on sgmllib.py and the API is slightly different.
# XXX There should be a-to distinguish between PCDATA (parsed
# character data-the normal case), RCDATA (replaceable character
# Data--only char and entity references and end tags is special)
# and CDATA (character data-only end tags is special).
Import _markupbase as Markupbase
Import re
# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# See http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

attrfind = re.compile(
    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# The HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
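# For example, entityref matches "&amp;" (capturing "amp"), charref matches
# both decimal "&#38;" and hexadecimal "&#x26;" forms, and endtagfind matches
# "</div >" (capturing "div").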

class HTMLParseError(Exception):
    """Exception raised for all parse errors."""

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result

class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None
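
    # While CDATA mode is active, self.interesting only matches the closing
    # tag of self.cdata_elem, so e.g. the body of a <script> element reaches
    # handle_data() as raw, unparsed text.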

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[0:2])
                        i = self.updatepos(i, 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    # incomplete
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
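
    # Note: feed() runs goahead(0), so an unterminated construct at the end of
    # the buffer stays in self.rawdata until more data arrives; close() runs
    # goahead(1), which forces whatever is left to be handled.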

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # See http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2:j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
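
    # For example, parse_starttag() on '<a HREF="x.html" disabled>' records the
    # full tag text and calls handle_starttag('a', [('href', 'x.html'),
    # ('disabled', None)]).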

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None

    def unescape(self, s):
        if '&' not in s:
            return s

        def replaceEntities(s):
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x', 'X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return chr(c)
            except ValueError:
                return '&#' + s + ';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities as htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos': "'"}
                    for k, v in htmlentitydefs.name2codepoint.items():
                        entitydefs[k] = chr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&' + s + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s)
Python can then turn HTML entities back into ordinary characters.
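As a quick sanity check of the module above, a minimal subclass can simply print the events the parser reports. The sketch below is only an illustration, not part of the module; EventPrinter is a made-up name, and the snippet assumes the HTMLParser class defined above is in scope:

class EventPrinter(HTMLParser):
    """Report every event received from the parser."""

    def handle_starttag(self, tag, attrs):
        print("start:", tag, attrs)

    def handle_endtag(self, tag):
        print("end:", tag)

    def handle_data(self, data):
        print("data:", repr(data))

    def handle_entityref(self, name):
        print("entity:", name)


p = EventPrinter()
p.feed('<p class="intro">Tom &amp; Jerry</p>')
p.close()
# Expected events: start tag "p" with [('class', 'intro')], data "Tom ",
# entity reference "amp", data " Jerry", then end tag "p".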