// The HTML text-extraction logic is wrapped in a single class so it can be reused directly.
using System;
using System.Configuration;
using System.Data;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
/// <summary>
/// Extracts the plain text from an HTML document by removing comments,
/// script/noscript/style blocks and tags, then decoding a small set of
/// character entities.
/// </summary>
/// <remarks>
/// The extraction is regex-based and best-effort; it is not an HTML parser.
/// A raw, unescaped '&lt;' in text is treated as the start of a tag.
/// Only the entities &amp;nbsp; &amp;quot; &amp;lt; &amp;gt; and &amp;amp; are decoded.
/// </remarks>
public class HtmlExtract
{
    #region Private fields

    // Block-level patterns must span line breaks and ignore tag-name casing.
    private const RegexOptions BlockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Lazy ".*?" (instead of "[^-]*") so comments that contain hyphens are removed too.
    private static readonly Regex CommentPattern =
        new Regex(@"<!--.*?-->", RegexOptions.Singleline);

    private static readonly Regex ScriptPattern =
        new Regex(@"<script\b[^>]*>.*?</script\s*>", BlockOptions);

    private static readonly Regex NoScriptPattern =
        new Regex(@"<noscript\b[^>]*>.*?</noscript\s*>", BlockOptions);

    private static readonly Regex StylePattern =
        new Regex(@"<style\b[^>]*>.*?</style\s*>", BlockOptions);

    // Accepts <br>, <BR>, <br/> and <br />.
    private static readonly Regex LineBreakPattern =
        new Regex(@"<br\s*/?>", RegexOptions.IgnoreCase);

    // Any remaining tag; "[^>]" also matches newlines, so multi-line tags are covered.
    private static readonly Regex TagPattern = new Regex(@"<[^>]*>");

    private readonly string _strHtml;

    #endregion

    #region Public methods

    /// <summary>
    /// Creates an extractor for the given HTML source.
    /// </summary>
    /// <param name="html">HTML markup to process; <c>null</c> is treated as an empty document.</param>
    public HtmlExtract(string html)
    {
        _strHtml = html ?? string.Empty;
    }

    /// <summary>
    /// Returns the text content of the HTML passed to the constructor.
    /// </summary>
    /// <returns>
    /// The text with markup removed, <c>&lt;br&gt;</c> converted to "\r\n",
    /// the supported entities decoded, and leading/trailing whitespace trimmed.
    /// </returns>
    public string ExtractText()
    {
        string result = _strHtml;
        result = RemoveComment(result);
        result = RemoveScript(result);
        result = RemoveStyle(result);
        result = RemoveTags(result);
        return result.Trim();
    }

    #endregion

    #region Private methods

    /// <summary>Removes HTML comments, including multi-line ones.</summary>
    private static string RemoveComment(string input)
    {
        return CommentPattern.Replace(input, string.Empty);
    }

    /// <summary>Removes style elements together with their CSS content.</summary>
    private static string RemoveStyle(string input)
    {
        return StylePattern.Replace(input, string.Empty);
    }

    /// <summary>Removes script and noscript elements together with their content.</summary>
    private static string RemoveScript(string input)
    {
        string result = ScriptPattern.Replace(input, string.Empty);
        return NoScriptPattern.Replace(result, string.Empty);
    }

    /// <summary>
    /// Converts line-break tags to "\r\n", strips every remaining tag and
    /// decodes the supported character entities.
    /// </summary>
    private static string RemoveTags(string input)
    {
        string result = LineBreakPattern.Replace(input, "\r\n");
        result = TagPattern.Replace(result, string.Empty);

        // Decode entities only AFTER the tags are gone: text such as
        // "&lt;b&gt;" is escaped content and must not be stripped as markup.
        result = result.Replace("&nbsp;", " ");
        result = result.Replace("&quot;", "\"");
        result = result.Replace("&lt;", "<");
        result = result.Replace("&gt;", ">");
        // "&amp;" goes last so "&amp;lt;" yields "&lt;" rather than "<".
        result = result.Replace("&amp;", "&");
        return result;
    }

    #endregion
}