#-*-coding:utf-8-*-
import re
# Filter the tags out of HTML.
# Removes tags and similar markup from an HTML string.
# @param htmlstr HTML string.
def filter_tags(htmlstr):
    """Strip markup from an HTML string and return the remaining text.

    Processing order: CDATA sections, <script> blocks and <style> blocks are
    removed, <br> is turned into a newline, the remaining tags and comments
    are dropped, runs of newlines are collapsed to one, and finally
    character entities are decoded by replacecharentity().

    NOTE: the script/style patterns use ``[^<]*`` for the element body, so a
    block whose content contains '<' is not removed as a unit.

    @param htmlstr HTML string.
    @return the text with markup removed.
    """
    # Filter CDATA first.
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA section
    re_script = re.compile(
        r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # <script> block
    re_style = re.compile(
        r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # <style> block
    re_br = re.compile(r'<br\s*?/?>')  # line break
    re_h = re.compile(r'</?\w+[^>]*>')  # any HTML tag
    re_comment = re.compile(r'<!--[^>]*-->')  # HTML comment

    s = re_cdata.sub('', htmlstr)  # remove CDATA
    s = re_script.sub('', s)  # remove script blocks
    s = re_style.sub('', s)  # remove style blocks
    s = re_br.sub('\n', s)  # convert <br> into a newline
    s = re_h.sub('', s)  # remove HTML tags
    s = re_comment.sub('', s)  # remove HTML comments

    # Collapse redundant blank lines.
    blank_line = re.compile(r'\n+')
    s = blank_line.sub('\n', s)

    # Decode entities last so that "&lt;b&gt;" is not mistaken for a tag.
    s = replacecharentity(s)
    return s
# Replace common HTML character entities.
# Substitutes ordinary characters for the special character entities in HTML.
# You can add new entities to char_entities to process more HTML character entities.
# @param htmlstr HTML string.
def replacecharentity(htmlstr):
    """Replace common HTML character entities with plain characters.

    Recognises both the named form (``&lt;``) and the numeric form
    (``&#60;``) of the entities listed in ``char_entities``. Any other
    entity-shaped token (``&name;`` / ``&#name;``) is replaced with an
    empty string.

    @param htmlstr HTML string.
    @return the string with entities substituted.
    """
    char_entities = {
        'nbsp': ' ', '160': ' ',
        'lt': '<', '60': '<',
        'gt': '>', '62': '>',
        'amp': '&', '38': '&',
        'quot': '"', '34': '"',
    }
    re_charentity = re.compile(r'&#?(?P<name>\w+);')

    def _decode(match):
        # The key is the entity without its "&"/"&#" prefix and ";" suffix,
        # e.g. "gt" for "&gt;". Unknown entities become the empty string.
        return char_entities.get(match.group('name'), '')

    # A single left-to-right pass: text produced by a substitution is never
    # rescanned, so "&amp;lt;" yields "&lt;" rather than being decoded twice.
    return re_charentity.sub(_decode, htmlstr)
def repalce(s, re_exp, repl_string):
    """Return s with every match of re_exp replaced by repl_string.

    @param s input string.
    @param re_exp compiled regular expression (anything with a ``sub`` method).
    @param repl_string replacement passed to ``re_exp.sub``.
    @return the substituted string.
    """
    return re_exp.sub(repl_string, s)
if __name__ == '__main__':
    # Demo: read a saved page from the working directory and print its text.
    # The with-block closes the file handle once the content has been read.
    with open('google.htm') as page:
        s = page.read()
    news = filter_tags(s)
    print(news)
# Python: using regular expressions to remove HTML tags (reposted)