C# TextNoHTML: converting HTML into plain text.
I don't remember where I first saw this code, but it is quite practical.
/// <summary>
/// Converts HTML markup into plain text: script blocks and tags are removed,
/// a handful of common character entities are decoded, any remaining numeric
/// entities are dropped, and leftover angle brackets and line breaks are stripped.
/// </summary>
/// <param name="Htmlstring">HTML text value. May be null or empty.</param>
/// <returns>
/// The text with HTML removed; an empty string when <paramref name="Htmlstring"/>
/// is null or empty.
/// </returns>
public string TextNoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Delete script blocks. Singleline lets '.' cross line breaks so that the body of
    // a multi-line <script> element is removed together with its tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Delete HTML tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    // Delete a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    // Delete comment remnants: a dangling terminator, then an opener up to the end of its line.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode common character entities (named or numeric form).
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);
    // Drop every numeric entity that was not decoded above.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
    // Decode '&amp;' last so that an escaped entity such as "&amp;lt;" becomes the
    // literal text "&lt;" instead of being decoded a second time.
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

    // Strip any angle brackets that remain (including ones produced by decoding
    // &lt; / &gt;) so that no markup delimiters survive in the result.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");

    // Strip remaining line breaks.
    Htmlstring = Htmlstring.Replace("\r\n", "");
    Htmlstring = Htmlstring.Replace("\r", "");
    Htmlstring = Htmlstring.Replace("\n", "");

    // Return the string with the HTML removed.
    return Htmlstring;
}
// Matches an <img> tag and captures the value of its src attribute (quoted or
// unquoted) in the named group "imgUrl". Built once and reused across calls.
private static readonly Regex HtmlImageSrcRegex = new Regex(
    @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Obtains the src paths of all img tags found in an HTML string.
/// </summary>
/// <param name="htmlText">HTML string text. May be null or empty.</param>
/// <returns>
/// The image paths as an array, in document order; an empty array when
/// <paramref name="htmlText"/> is null or empty or contains no img tags.
/// </returns>
public static string[] GetHtmlImageUrlList(string htmlText)
{
    if (string.IsNullOrEmpty(htmlText))
    {
        return new string[0];
    }

    // Collect every img tag match.
    MatchCollection matches = HtmlImageSrcRegex.Matches(htmlText);
    string[] sUrlList = new string[matches.Count];

    // Traverse all img tag matches and store each captured src path.
    int i = 0;
    foreach (Match match in matches)
    {
        sUrlList[i++] = match.Groups["imgUrl"].Value;
    }

    return sUrlList;
}