In a recent project I needed to strip all HTML formatting from stored string content, leaving only the plain text.
I looked up related material online, including how to use regular expressions for this,
and found a method worth recommending. Beyond the HTML removal itself, you may need to add further regular-expression matches for your own needs; here I added one that removes all whitespace.
The plain-text extraction method is posted below for reference. Comments and suggestions are welcome.
Reference material: http://www.suzhou35.com/blog/article.asp?id=169
1. [Use regular expressions to extract text from HTML]
The namespace to be referenced is System.Text.RegularExpressions:
using System.Text.RegularExpressions;
Approach:
A. First replace all line breaks and tabs in the HTML text with spaces (because line breaks and extra spaces in HTML are ignored).
B. Remove all content in the <head> tag.
C. Remove all content in the <script> tag.
D. Remove all content in the <style> tag.
E. Replace <td> with a space, and <tr>, <li>, <br>, <p> and similar tags with line breaks.
F. Remove all remaining opening and closing tags enclosed in "<>".
G. Convert escape sequences such as &amp; and &nbsp; to their corresponding characters.
H. Remove unnecessary spaces and empty lines.
The method is as follows:
/// <summary>
/// Converts HTML content to plain text by removing the HTML formatting.
/// </summary>
/// <param name="source">The HTML text to convert. May be null or empty.</param>
/// <returns>
/// The text content of <paramref name="source"/> with head/script/style sections,
/// tags and common entities removed. The final step strips every whitespace
/// character, so the result contains no spaces or line breaks. Returns an empty
/// string when <paramref name="source"/> is null or empty.
/// </returns>
public static string ConvertToText(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    // Replace line breaks and tabs with spaces. This also lets '.' in the
    // patterns below span what used to be multiple lines.
    string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

    // Remove the header. The match is lazy so that only the head section itself is dropped.
    result = Regex.Replace(result, "(<head>).*?(</head>)", string.Empty, RegexOptions.IgnoreCase);

    // Remove all scripts: first strip attributes from the opening tag, then drop each
    // script block. The lazy match keeps text that sits between two separate blocks.
    result = Regex.Replace(result, @"<( )*script([^>])*>", "<script>", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"(<script>).*?(</script>)", string.Empty, RegexOptions.IgnoreCase);

    // Remove all styles, using the same two-step approach.
    result = Regex.Replace(result, @"<( )*style([^>])*>", "<style>", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, "(<style>).*?(</style>)", string.Empty, RegexOptions.IgnoreCase);

    // Insert a space in place of <td> tags.
    result = Regex.Replace(result, @"<( )*td([^>])*>", " ", RegexOptions.IgnoreCase);

    // Insert line breaks in place of <br> and <li> tags.
    result = Regex.Replace(result, @"<( )*br( )*>", "\r", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"<( )*li( )*>", "\r", RegexOptions.IgnoreCase);

    // Insert line breaks in place of <tr> and <p> tags.
    result = Regex.Replace(result, @"<( )*tr([^>])*>", "\r", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"<( )*p([^>])*>", "\r", RegexOptions.IgnoreCase);

    // Remove anything that is still enclosed inside <>.
    result = Regex.Replace(result, @"<[^>]*>", string.Empty, RegexOptions.IgnoreCase);

    // Replace special characters, then drop any other short entity.
    result = Regex.Replace(result, @"&amp;", "&", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"&nbsp;", " ", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"&lt;", "<", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"&gt;", ">", RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"&(.{2,6});", string.Empty, RegexOptions.IgnoreCase);

    // Collapse runs of spaces and redundant line breaks.
    result = Regex.Replace(result, @"( )+", " ");
    result = Regex.Replace(result, "(\r)( )+(\r)", "\r");
    result = Regex.Replace(result, @"(\r)+", "\r\n");

    // Remove all remaining whitespace. This is the extra step the article adds on purpose.
    result = Regex.Replace(result, @"\s", "");

    return result;
}
2. [Retrieve the image address from the text]
Program code:
/// <summary>
/// Extracts the <c>src</c> attribute value of the first <c>&lt;img&gt;</c> tag found in an HTML string.
/// </summary>
/// <param name="htmlStr">The HTML text to search. May be null or empty.</param>
/// <returns>
/// The image address exactly as written in the markup (original letter case, without
/// surrounding quotes), or an empty string when no <c>&lt;img&gt;</c> tag with a
/// <c>src</c> attribute is found.
/// </returns>
public static string GetImgUrl(string htmlStr)
{
    if (string.IsNullOrEmpty(htmlStr))
    {
        return string.Empty;
    }

    // Accepts double-quoted, single-quoted and unquoted attribute values. Matching is
    // case-insensitive via RegexOptions instead of lower-casing the input, so the
    // returned URL keeps its original case.
    const string imgSrcPattern =
        @"<img\b[^>]*?\ssrc\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))";

    Match match = Regex.Match(htmlStr, imgSrcPattern, RegexOptions.IgnoreCase);
    return match.Success ? match.Groups["url"].Value : string.Empty;
}