This is an original article; if you reprint it, please credit the source:
This approach uses regular expressions. I have not tested it much, so I do not know whether it has performance problems.
These days I mostly use BeautifulSoup for this kind of processing.
The HTML entity handling covers only some of the most commonly used entities.
# -*- coding: utf-8 -*-
import re

# Patterns are compiled once at import time so repeated calls do not pay
# for pattern lookup/compilation again.
# JavaScript-commented CDATA markers, e.g. "//<![CDATA[ ... //]]>".
_RE_CDATA = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
# <script>/<style> elements including their bodies. Non-greedy DOTALL
# matching is used so bodies containing "<" or newlines are still removed.
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
_RE_STYLE = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
# HTML comments; DOTALL so multi-line comments and comments containing ">"
# are matched as a whole.
_RE_COMMENT = re.compile(r'<!--.*?-->', re.S)
# <br>, <br/>, <br /> in any letter case.
_RE_BR = re.compile(r'<br\s*?/?>', re.I)
# Any remaining opening or closing tag.
_RE_TAG = re.compile(r'</?\w+[^>]*>')
# Runs of consecutive newlines.
_RE_BLANK_LINES = re.compile(r'\n+')
# Named ("&lt;") or numeric ("&#60;") character entity; the "name" group
# holds the text between "&"/"&#" and ";".
_RE_CHAR_ENTITY = re.compile(r'&#?(?P<name>\w+);')

# Supported character entities, keyed by entity name or decimal code.
# Add new entries here to handle more HTML character entities.
CHAR_ENTITIES = {
    'nbsp': ' ', '160': ' ',
    'lt': '<', '60': '<',
    'gt': '>', '62': '>',
    'amp': '&', '38': '&',
    'quot': '"', '34': '"',
}


def filter_tags(htmlstr):
    """Strip markup from an HTML string and return the remaining text.

    Processing order: CDATA markers, <script> and <style> elements and
    HTML comments are removed; <br> becomes a newline; all remaining tags
    are dropped; runs of newlines are collapsed to a single newline; and
    finally the common character entities are decoded.

    @param htmlstr: HTML string.
    @return: the text with tags removed and entities replaced.
    """
    s = _RE_CDATA.sub('', htmlstr)      # drop CDATA
    s = _RE_SCRIPT.sub('', s)           # drop scripts
    s = _RE_STYLE.sub('', s)            # drop styles
    # Comments go before generic tag stripping so that a comment containing
    # ">" or tags is removed as one unit instead of leaking its content.
    s = _RE_COMMENT.sub('', s)
    s = _RE_BR.sub('\n', s)             # <br> -> newline
    s = _RE_TAG.sub('', s)              # drop remaining HTML tags
    s = _RE_BLANK_LINES.sub('\n', s)    # collapse redundant blank lines
    # Entities are decoded last so that "&lt;b&gt;" in the text is not
    # mistaken for a tag and stripped.
    return replaceCharEntity(s)


def replaceCharEntity(htmlstr):
    """Replace common HTML character entities with the plain characters.

    Entities listed in CHAR_ENTITIES are replaced by their character; any
    other text of the form "&name;" or "&#code;" is replaced by an empty
    string. The input is decoded in a single pass, so the output of one
    replacement is never decoded again ("&amp;lt;" yields "&lt;").

    @param htmlstr: HTML string.
    @return: the string with entities replaced.
    """
    def _decode(match):
        # Unknown entities are replaced with an empty string.
        return CHAR_ENTITIES.get(match.group('name'), '')

    return _RE_CHAR_ENTITY.sub(_decode, htmlstr)


def repalce(s, re_exp, repl_string):
    """Return `s` with every match of the compiled pattern replaced.

    The misspelled name is kept so existing callers continue to work.

    @param s: input string.
    @param re_exp: compiled regular expression object.
    @param repl_string: replacement passed to `re_exp.sub`.
    @return: the substituted string.
    """
    return re_exp.sub(repl_string, s)


if __name__ == '__main__':
    # Demo: strip the markup from a saved page and print the text.
    with open('google.htm') as fh:
        s = fh.read()
    news = filter_tags(s)
    print(news)
Output results:
Google webpage picture map information Video finance more Blog Life Hot List website navigation Calendar Photo Document collaboration Platform Input Method toolbar software Select more Personalized Home | Login
Advanced Search
Use preferences
Language Tools All Web pages Chinese page Simplified Chinese web page
Advertising program-Google encyclopedia-google.com in English2009-Privacy-ICP Certificate b2-20070004 number