/// <summary>
/// Reduces an HTML fragment to a plain article body made of &lt;p&gt;, &lt;img&gt; and
/// &lt;strong&gt; tags.
/// </summary>
/// <param name="content">The HTML fragment to clean. May be null or empty.</param>
/// <returns>
/// The cleaned markup, or an empty string when <paramref name="content"/> is null or empty.
/// </returns>
/// <remarks>
/// Processing order:
/// 1. Comments, doctype declarations and processing instructions are removed.
/// 2. Every tag is rewritten in a single pass: &lt;p&gt; loses its attributes, &lt;img&gt; is
///    reduced to its src attribute, &lt;strong&gt; is kept as written, &lt;br&gt; becomes a line
///    break, and every other tag is removed (the text between removed tags is kept).
/// 3. Each CR, LF or TAB character becomes a paragraph boundary (&lt;/p&gt;&lt;p&gt;).
/// 4. Leading blanks inside paragraphs, doubled paragraph tags and empty paragraphs are removed.
/// The method does not wrap text in &lt;p&gt;...&lt;/p&gt; if the input was not wrapped already.
/// </remarks>
public static string ClearHtml(string content)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Comments and declarations are not tags; drop them whole so that a '>' inside a
    // comment cannot leave half of it behind.
    content = Regex.Replace(content, @"<!--.*?-->|<[!?][^>]*>", string.Empty, options);

    // One pass over all tags. The name must start with a letter, so a bare '<' in text
    // ("1 < 2") is not treated as a tag. Atomic groups keep an unterminated '<abc...' from
    // backtracking quadratically.
    content = Regex.Replace(
        content,
        @"<(?<close>/?)(?<txt>[a-z](?>[^\s/>]*))(?>[^>]*)>",
        new MatchEvaluator(RewriteTag),
        options);

    // Every line-break or tab character closes the current paragraph and opens a new one.
    // The empty paragraphs this produces for "\r\n" are removed further down.
    content = Regex.Replace(content, @"[\r\n\t]", "</p><p>", options);
    content = content.Trim();

    // A leading line break leaves a dangling "</p>" at the start, a trailing one leaves a
    // dangling "<p>" at the end.
    if (content.StartsWith("</p>", System.StringComparison.Ordinal))
    {
        content = content.Substring(4);
    }

    if (content.EndsWith("<p>", System.StringComparison.Ordinal))
    {
        content = content.Remove(content.Length - 3);
    }

    // Strip blanks at the start of each paragraph. \s already covers the no-break and
    // full-width space characters.
    // NOTE(review): the "&nbsp;" entity is assumed to be what the published listing lost
    // in its "<p>\s* " pattern - confirm against the original source.
    content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)+", "<p>", options);

    // Collapse runs of opening tags and runs of closing tags into a single tag.
    content = Regex.Replace(content, @"(?:<p>\s*)+<p>", "<p>", options);
    content = Regex.Replace(content, @"(?:</p>\s*)+</p>", "</p>", options);

    // Remove paragraphs that contain nothing but blanks.
    content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)*</p>", string.Empty, options);

    // NOTE(review): the published listing ended with Replace("<p>", "<p>"), described as
    // adding a first-line indent. As written it changes nothing, so it is omitted here;
    // add the intended indent string at this point if one is wanted.
    return content;
}

/// <summary>
/// Decides what a single tag matched by <see cref="ClearHtml"/> is replaced with.
/// </summary>
/// <param name="tag">A match exposing the groups "close" (optional slash) and "txt" (tag name).</param>
/// <returns>The replacement text for the tag; an empty string removes it.</returns>
private static string RewriteTag(Match tag)
{
    bool isClosing = tag.Groups["close"].Length > 0;

    // Invariant lower-casing: a culture-sensitive ToLower() maps "IMG" to a dotless-i
    // spelling under Turkish culture and the tag would no longer be recognised.
    switch (tag.Groups["txt"].Value.ToLowerInvariant())
    {
        case "p":
            return isClosing ? "</p>" : "<p>";

        case "br":
            // Turned into a paragraph boundary by the caller's line-break step.
            return "\r\n";

        case "strong":
            return tag.Value;

        case "img":
            // Keep only a quoted src attribute; the look-behind skips names such as data-src.
            Match src = Regex.Match(
                tag.Value,
                @"(?<![\w-])src\s*=\s*(?:""(?<txt>[^""]*)""|'(?<txt>[^']*)')",
                RegexOptions.IgnoreCase);
            if (!src.Success)
            {
                return tag.Value;
            }

            // The value is re-emitted inside single quotes, so a quote in it must be encoded.
            return "<img src='" + src.Groups["txt"].Value.Replace("'", "%27") + "'/>";

        default:
            return string.Empty;
    }
}
Removes all tags other than p, img and strong; the attributes of the p and img tags are cleared as well. It is intended specifically for producing a clean web-page body, and can be used to format and lay out content after it has been collected. This is my own code; the algorithm may not be efficient, but it is enough to meet current needs.
Web-page formatting and layout code, intended specifically for content obtained through information collection.