HtmlEntities, htmlentitiesphp
#region GetOnlyTextFromHtmlCode + RemoveHtmlChars + RemoveTagFromHtmlCode

/// <summary>
/// Converts an HTML fragment to plain text: paragraph ends and
/// <c>&lt;br&gt;</c> tags become line breaks, comments, <c>style</c> and
/// <c>script</c> elements and all remaining tags are removed, a fixed set
/// of character entities is decoded and runs of spaces are collapsed.
/// </summary>
/// <remarks>
/// Adapted from the ZetaHtmlEditControl sources:
/// http://www.codeproject.com/script/Content/ViewAssociatedFile.aspx?rzp=%2FKB%2Fedit%2FZetaHtmlEditControl%2F%2FZetaHtmlEditControl-Source.zip&amp;zep=Control%2FHtmlEditControl.cs&amp;obid=43954&amp;obtid=2&amp;ovid=13
/// </remarks>
/// <param name="htmlCode">The HTML markup to convert.</param>
/// <returns>
/// The plain-text rendering of <paramref name="htmlCode"/>; an empty string
/// when the input is <c>null</c> or empty.
/// </returns>
private static string getOnlyTextFromHtmlCode(string htmlCode)
{
    if (string.IsNullOrEmpty(htmlCode))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.Singleline | RegexOptions.IgnoreCase;
    string paragraphBreak = Environment.NewLine + Environment.NewLine;

    // Line breaks inside the markup carry no meaning in HTML: flatten them
    // to spaces before inserting the "real" breaks below.
    htmlCode = htmlCode.Replace("\r\n", " ");
    htmlCode = htmlCode.Replace("\r", " ");
    htmlCode = htmlCode.Replace("\n", " ");

    // </p> ends a paragraph: emit a blank line.
    htmlCode = htmlCode.Replace("</p>", paragraphBreak);
    htmlCode = htmlCode.Replace("</P>", paragraphBreak);

    // HTML comments.
    htmlCode = Regex.Replace(htmlCode, @"<!--.*?-->", string.Empty, options);

    // <br>, <br/>, <br clear="all"> ... become a single line break.
    htmlCode = Regex.Replace(htmlCode, @"<br[^>]*>", Environment.NewLine, options);

    // Elements whose *content* must not show up in the text.
    htmlCode = removeTagFromHtmlCode("style", htmlCode);
    htmlCode = removeTagFromHtmlCode("script", htmlCode);

    // Every remaining tag. With Singleline '.' already matches '\n', so this
    // is equivalent to the former "<(.|\n)+?>" without the capturing
    // alternation.
    htmlCode = Regex.Replace(htmlCode, @"<.+?>", string.Empty, options);

    // Character entities (umlauts and a few symbols).
    htmlCode = unescapeHtmlEntities(htmlCode);

    // Collapse runs of spaces (line breaks inserted above are kept).
    htmlCode = Regex.Replace(htmlCode, @" +", " ", options);

    return htmlCode;
}

/// <summary>
/// Entity-to-character table used by <c>unescapeHtmlEntities</c>.
/// Column 0 is the entity as written in HTML, column 1 its replacement.
/// The replacements are spelled as <c>\u</c> escapes so that the table
/// survives a change of the source file's encoding.
/// See http://dev.w3.org/html5/html-author/charref
/// </summary>
private static readonly string[,] htmlEntityMap =
{
    // NOTE(review): nbsp is mapped to a plain space (not U+00A0) so that the
    // caller's space-collapsing step treats it like any other blank - confirm.
    { "&nbsp;",   " " },
    { "&auml;",   "\u00E4" }, // a with diaeresis
    { "&Auml;",   "\u00C4" }, // A with diaeresis
    { "&ouml;",   "\u00F6" }, // o with diaeresis
    { "&Ouml;",   "\u00D6" }, // O with diaeresis
    { "&uuml;",   "\u00FC" }, // u with diaeresis
    { "&Uuml;",   "\u00DC" }, // U with diaeresis
    { "&szlig;",  "\u00DF" }, // sharp s
    { "&pound;",  "\u00A3" }, // pound sign
    { "&sect;",   "\u00A7" }, // section sign
    { "&copy;",   "\u00A9" }, // copyright sign
    { "&reg;",    "\u00AE" }, // registered sign
    { "&micro;",  "\u00B5" }, // micro sign
    { "&para;",   "\u00B6" }, // pilcrow
    { "&Oslash;", "\u00D8" }, // O with stroke
    { "&oslash;", "\u00F8" }, // o with stroke
    { "&divide;", "\u00F7" }, // division sign
    { "&times;",  "\u00D7" }, // multiplication sign
};

/// <summary>
/// Replaces the character entities listed in <c>htmlEntityMap</c> with the
/// characters they stand for. Entity names are matched case-sensitively
/// (<c>&amp;auml;</c> and <c>&amp;Auml;</c> are different characters);
/// entities that are not in the table are left untouched.
/// </summary>
/// <param name="htmlCode">Text that may contain character entities.</param>
/// <returns>
/// The text with the known entities decoded; <c>null</c> or empty input is
/// returned unchanged.
/// </returns>
private static string unescapeHtmlEntities(string htmlCode)
{
    if (string.IsNullOrEmpty(htmlCode))
    {
        return htmlCode;
    }

    for (int row = 0; row < htmlEntityMap.GetLength(0); row++)
    {
        // string.Replace(string, string) performs an ordinal comparison.
        htmlCode = htmlCode.Replace(htmlEntityMap[row, 0], htmlEntityMap[row, 1]);
    }

    return htmlCode;
}

/// <summary>
/// Removes every element named <paramref name="tag"/> from the markup,
/// from its opening <c>&lt;tag</c> up to and including the next
/// <c>&lt;/tag&gt;</c>, content included. Matching is case-insensitive and
/// spans line breaks.
/// </summary>
/// <param name="tag">Element name without angle brackets, e.g. <c>script</c>.</param>
/// <param name="htmlCode">The HTML markup to clean.</param>
/// <returns>
/// The markup without the matching elements; <c>null</c> or empty markup is
/// returned unchanged.
/// </returns>
private static string removeTagFromHtmlCode(
    string tag,
    string htmlCode)
{
    if (string.IsNullOrEmpty(htmlCode))
    {
        return htmlCode;
    }

    // The tag name is data, not a pattern: escape it so that regex
    // metacharacters in it cannot change the meaning of the expression.
    string pattern = string.Format(@"<{0}.*?</{0}>", Regex.Escape(tag));

    return Regex.Replace(
        htmlCode,
        pattern,
        string.Empty,
        RegexOptions.Singleline | RegexOptions.IgnoreCase);
}

#endregion