Key code (see the section highlighted in red):
# -*- coding: utf-8 -*-
import re

# Mapping from an entity name (the text between "&" / "&#" and ";") to the
# character it stands for.  Add more entries here to decode more entities;
# anything not listed is replaced by the empty string.
CHAR_ENTITIES = {
    'nbsp': ' ', '160': ' ',
    'lt': '<', '60': '<',
    'gt': '>', '62': '>',
    'amp': '&', '38': '&',
    'quot': '"', '34': '"',
}

# Patterns are raw strings (no invalid-escape warnings) and compiled once.
# Bodies are matched lazily with DOTALL so that "<" or ">" *inside* a script,
# style sheet, comment or CDATA wrapper does not stop the match early.
_RE_CDATA = re.compile(r'//<!\[CDATA\[.*?//\]\]>', re.I | re.S)
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
_RE_STYLE = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
_RE_COMMENT = re.compile(r'<!--.*?-->', re.S)
_RE_BR = re.compile(r'<br\s*/?>', re.I)
_RE_TAG = re.compile(r'</?\w+[^>]*>')
_RE_BLANK_LINES = re.compile(r'\n+')
_RE_CHAR_ENTITY = re.compile(r'&#?(?P<name>\w+);')


def filter_tags(htmlstr):
    """Strip markup from an HTML string and return the remaining text.

    Steps, in order: remove ``//<![CDATA[ ... //]]>`` wrappers, ``<script>``
    and ``<style>`` elements (including their content) and HTML comments;
    turn ``<br>`` into a newline; remove every remaining tag; collapse runs
    of newlines into one; finally decode character entities with
    ``replacecharentity``.

    @param htmlstr  HTML string.
    @return         The text with tags removed and entities decoded.
    """
    s = _RE_CDATA.sub('', htmlstr)
    s = _RE_SCRIPT.sub('', s)
    s = _RE_STYLE.sub('', s)
    # Comments go before the generic tag pass so that a ">" inside a comment
    # (e.g. conditional comments) cannot leave half of it behind.
    s = _RE_COMMENT.sub('', s)
    # Case-insensitive: "<BR>" must become a newline, not just disappear.
    s = _RE_BR.sub('\n', s)
    s = _RE_TAG.sub('', s)
    s = _RE_BLANK_LINES.sub('\n', s)
    # Entities are decoded last so that "&lt;b&gt;" survives as literal text
    # instead of being stripped as a tag.
    return replacecharentity(s)


def replacecharentity(htmlstr):
    """Replace common HTML character entities with the characters they denote.

    Every ``&name;`` / ``&#name;`` occurrence is looked up in
    ``CHAR_ENTITIES``; entities that are not in the table are replaced by the
    empty string.

    @param htmlstr  HTML string.
    @return         The string with entities replaced.
    """
    def _decode(match):
        return CHAR_ENTITIES.get(match.group('name'), '')

    # A single pass: the text produced by one replacement is never rescanned,
    # so "&amp;lt;" decodes to "&lt;" rather than being decoded twice to "<".
    return _RE_CHAR_ENTITY.sub(_decode, htmlstr)


def repalce(s, re_exp, repl_string):
    """Return ``s`` with every match of the compiled pattern ``re_exp``
    replaced by ``repl_string``.

    The misspelt name is kept as-is so existing callers keep working.
    """
    return re_exp.sub(repl_string, s)


if __name__ == '__main__':
    # "with" closes the file even if reading fails.
    with open('google.htm') as fh:
        s = fh.read()
    news = filter_tags(s)
    print(news)