Reprinted from https://gist.github.com/859717133. A simple way to filter HTML tags is to strip everything enclosed in <> with a regular expression:
import re

# Strip every opening/closing tag with one regular expression.
# The variable is called ``text`` so the builtin ``str`` is not shadowed.
text = "<a>srcd</a>hello</br><br/>"
text = re.sub(r'</?\w+[^>]*>', '', text)
print(text)
# The same thing using HTMLParser. Is there a simpler way? A regular expression, perhaps?
def strip_tags(html):
    """Return the text content of *html* with all HTML tags removed.

    The input is fed through the standard-library HTML parser and only the
    character data it reports is kept.

    >>> str_text = strip_tags("<font color=red>hello</font>")
    >>> print(str_text)
    hello

    :param html: HTML string to clean.
    :return: the concatenated text found between the tags.
    """
    # Imported locally so the snippet stays self-contained (as the original
    # did); the import resolves the module, not the ``html`` parameter.
    from html.parser import HTMLParser

    # strip() with no argument already removes surrounding newlines, so a
    # second strip("\n") is unnecessary.
    html = html.strip()
    result = []
    parser = HTMLParser()
    # Route every text chunk the parser reports straight into the list.
    parser.handle_data = result.append
    parser.feed(html)
    # close() flushes any text the parser is still buffering.
    parser.close()
    return ''.join(result)
# More thorough filtering, similar to what services such as Instapaper or Read It Later do; an interesting topic for further research.
# -*- coding: utf-8 -*-
import re


def filter_tags(htmlstr):
    """Remove tags and other markup from an HTML string.

    Processing order: script-style CDATA wrappers, ``<script>`` blocks,
    ``<style>`` blocks and HTML comments are dropped; ``<br>`` becomes a line
    feed; all remaining tags are removed; runs of line feeds are collapsed to
    one; finally common character entities are replaced.

    :param htmlstr: HTML string.
    :return: the filtered text.
    """
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # match CDATA
    # ``.*?`` with re.S lets script/style bodies contain "<" and newlines.
    re_script = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)  # script
    re_style = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)  # style
    re_br = re.compile(r'<br\s*?/?>', re.I)  # line break
    re_h = re.compile(r'</?\w+[^>]*>')  # HTML tag
    # Comments may legitimately contain ">", so match lazily up to "-->".
    re_comment = re.compile(r'<!--.*?-->', re.S)  # HTML comment

    s = re_cdata.sub('', htmlstr)  # remove CDATA
    s = re_script.sub('', s)  # remove script
    s = re_style.sub('', s)  # remove style
    s = re_comment.sub('', s)  # remove HTML comments before generic tag stripping
    s = re_br.sub('\n', s)  # convert br to line feed
    s = re_h.sub('', s)  # remove the HTML tags

    # Remove the extra blank lines.
    blank_line = re.compile(r'\n+')
    s = blank_line.sub('\n', s)
    s = replacecharentity(s)  # replace entities
    return s


def replacecharentity(htmlstr):
    """Replace common HTML character entities with normal characters.

    New entries can be added to ``char_entities`` to handle more entities.
    Entities that are not in the table are replaced by an empty string.

    :param htmlstr: HTML string.
    :return: the string with entities replaced.
    """
    # Each character is listed under its entity name and its decimal code.
    char_entities = {
        'nbsp': ' ', '160': ' ',
        'lt': '<', '60': '<',
        'gt': '>', '62': '>',
        'amp': '&', '38': '&',
        'quot': '"', '34': '"',
    }
    re_charentity = re.compile(r'&#?(?P<name>\w+);')

    def _decode(match):
        # The key is the entity without "&", "#" and ";", e.g. "gt" for "&gt;".
        return char_entities.get(match.group('name'), '')

    # A single left-to-right pass: replaced text is never rescanned, so an
    # input such as "&amp;lt;" yields "&lt;" rather than being decoded twice.
    return re_charentity.sub(_decode, htmlstr)


def repalce(s, re_exp, repl_string):
    """Return *s* with every match of the compiled pattern *re_exp* replaced by *repl_string*."""
    return re_exp.sub(repl_string, s)


if __name__ == '__main__':
    # The context manager closes the file even if reading fails.
    with open('google.htm') as f:
        s = f.read()
    news = filter_tags(s)
    print(news)
If needed, the lines
blank_line = re.compile('\n+')
s = blank_line.sub('\n', s) can be changed to s = blank_line.sub('', s) to remove all line breaks.
Reprinted from https://gist.github.com/859717