Python: converting HTML entities back to characters


Resources:

http://www.360doc.com/content/17/0620/16/44530822_664927373.shtml

8253360

46562821

The final HTMLParser module should read:

"" A parser for HTML and XHTML. "" "

# This file was based on sgmllib.py and the API is slightly different.

# XXX There should be a-to distinguish between PCDATA (parsed
# character data-the normal case), RCDATA (replaceable character
# Data--only char and entity references and end tags is special)
# and CDATA (character data-only end tags is special).


Import _markupbase as Markupbase
Import re

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

attrfind = re.compile(
    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors."""

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")


    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instruction, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2:j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None
    def unescape(self, s):
        if '&' not in s:
            return s
        def replaceEntities(s):
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x', 'X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return chr(c)
            except ValueError:
                return '&#' + s + ';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities as htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos': "'"}
                    for k, v in htmlentitydefs.name2codepoint.items():
                        entitydefs[k] = chr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&' + s + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s)
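
To actually turn entities back into characters with this module, a minimal sketch like the one below can be used. It assumes Python 3.2/3.3, where HTMLParser still provides the unescape() helper shown above (unescape() was deprecated in 3.4 and removed in 3.9, and on Python 3.5+ the parser converts references itself when convert_charrefs=True; modern code would simply call html.unescape()). The subclass name and the sample strings are only illustrative.

from html.parser import HTMLParser

# Hypothetical subclass for illustration: collects text with entity and
# numeric character references converted back to plain characters.
class EntityRestoringParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []

    def handle_data(self, data):
        self.parts.append(data)

    def handle_entityref(self, name):
        # Named reference such as &amp; -- resolve it via unescape()
        self.parts.append(self.unescape('&%s;' % name))

    def handle_charref(self, name):
        # Numeric reference such as &#39; (decimal) or &#x27; (hex)
        if name.startswith(('x', 'X')):
            self.parts.append(chr(int(name[1:], 16)))
        else:
            self.parts.append(chr(int(name)))

p = EntityRestoringParser()
p.feed('Tom &amp; Jerry&#39;s &lt;b&gt;caf&eacute;&lt;/b&gt;')
p.close()
print(''.join(p.parts))   # Tom & Jerry's <b>café</b>

# Or, without subclassing, call the (protected) unescape() helper directly:
print(HTMLParser().unescape('&lt;p&gt;Fish &amp; chips&#33;&lt;/p&gt;'))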
