/*** Remove HTML tags from text * *@paraminputstring *@return */ public Staticstring Html2text (string Inputstring) {if(stringutils.isempty (inputstring)) {return NULL; } String htmlstr=inputstring; String Textstr= ""; Java.util.regex.Pattern p_script; Java.util.regex.Matcher m_script; Java.util.regex.Pattern p_style; Java.util.regex.Matcher m_style; Java.util.regex.Pattern p_html; Java.util.regex.Matcher m_html; Java.util.regex.Pattern p_html1; Java.util.regex.Matcher m_html1; Try{String Regex_script= "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>";//define a regular expression for script {or <script[^>]*?>[\\s\\S]*?<\\/script>// }String Regex_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>";//a regular expression that defines a style {or <style[^>]*?>[\\s\\S]*?<\\/style>// }String regex_html = "<[^>]+>";//Regular expressions that define HTML tagsString REGEX_HTML1 = "<[^>]+"; P_script=pattern.compile (regex_script, pattern.case_insensitive); M_script=P_script.matcher (htmlstr); Htmlstr= M_script.replaceall ("");//Filter Script TagsP_style=Pattern. Compile (regex_style, pattern.case_insensitive); M_style=P_style.matcher (htmlstr); Htmlstr= M_style.replaceall ("");//Filter Style Labelsp_html=pattern.compile (regex_html, pattern.case_insensitive); M_html=P_html.matcher (htmlstr); Htmlstr= M_html.replaceall ("");//Filter HTML TagsP_HTML1=Pattern. Compile (regex_html1, pattern.case_insensitive); M_HTML1=P_html1.matcher (htmlstr); Htmlstr= M_html1.replaceall ("");//Filter HTML TagsTextstr=htmlstr; //Replacement &nbsp;Textstr = Textstr.replaceall ("&", ""). replaceall ("nbsp;", ""); } Catch(Exception e) {System.err.println ("html2text:" +E.getmessage ()); } returntextstr;//returns a text string}
Using regular expressions in Java to filter tags out of HTML