This example shows a C# implementation of a method that converts HTML to plain text. It is shared here for your reference. The details are as follows:
How to use:
HtmlToText convert = new HtmlToText(); TextBox2.Text = convert.Convert(TextBox1.Text);
The C# code is as follows:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// This is a small hand-written scanner, not a full HTML parser: it walks the
/// input one character at a time, maps a fixed set of tags to line breaks or
/// tabs, drops the content of a few non-textual elements, collapses runs of
/// whitespace (except inside &lt;pre&gt;), and finally decodes HTML entities.
/// An instance keeps per-conversion state in fields, so a single instance
/// must not be used for two conversions at the same time.
/// </remarks>
class HtmlToText
{
    // Static data tables

    // Maps a lower-case tag name (closing tags include the leading '/') to
    // the text written to the output when that tag is encountered.
    protected static Dictionary<string, string> _tags;

    // Lower-case names of elements whose entire inner content is discarded.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;   // Output buffer for the current conversion
    protected string _html;        // Input being scanned
    protected int _pos;            // Index of the current character in _html

    // Static constructor (runs one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as
    /// an empty string.</param>
    /// <returns>Resulting plain text, with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? string.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            char current = Peek();

            if (current == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (char.IsWhiteSpace(current))
            {
                // Whitespace (treat all as a space unless preformatted)
                _text.Write(_text.Preformatted ? current : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(current);
                MoveAhead();
            }
        }

        // Entities are decoded last so that literal "&lt;" in the input is
        // never mistaken for the start of a tag while scanning.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns
    /// information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true if a '/' was seen after the tag
    /// name and outside a quoted attribute value.</param>
    /// <returns>The lower-case tag name, including a leading '/' for closing
    /// tags, or an empty string if the current character is not '&lt;'.</returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = string.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !char.IsWhiteSpace(Peek()) &&
                   Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }

            // Invariant lower-casing: culture-sensitive ToLower() would turn
            // "TITLE" into "tıtle" under Turkish culture and miss the lookup.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }

            // Step over the closing '>'
            MoveAhead();
        }

        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the element whose opening tag has just
    /// been parsed, up to and including its matching end tag (or the end of
    /// the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the element to skip.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // <script> and <style> contain raw text and cannot nest, so they end
        // at the first matching end tag. Other ignored elements (for example
        // <object>) may legitimately contain an element of the same name.
        bool canNest = tag != "script" && tag != "style";
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);

                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (canNest && !selfClosing && inner == tag)
                {
                    // Nested element of the same name needs its own end tag
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    /// <summary>
    /// Returns true if the current position is at the end of the string.
    /// </summary>
    protected bool EndOfText
    {
        get { return _pos >= _html.Length; }
    }

    /// <summary>
    /// Safely returns the character at the current position, or the NUL
    /// character when the position is at or past the end of the input.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character, never
    /// moving past the end of the input.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        // Peek() returns NUL at end of input, which is not whitespace, so
        // this loop terminates there.
        while (char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or
    /// the start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. Does nothing if the
    /// current character is not a single or double quote.
    /// </summary>
    protected void EatQuotedValue()
    {
        char quote = Peek();
        if (quote == '"' || quote == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value. A line break also ends the value so that an
            // unterminated quote cannot swallow the rest of the document.
            _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                // Closing quote (or line break)
                MoveAhead();
            }
        }
    }

    /// <summary>
    /// A StringBuilder wrapper that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;       // Completed output
        private StringBuilder _currLine;   // Line currently being assembled
        private int _emptyLines;           // Consecutive empty lines flushed so far
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded. If this
        /// property is set to true, they are passed through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write.</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write.</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (char.IsWhiteSpace(c))
            {
                // Write a single space character, collapsing runs of whitespace
                int len = _currLine.Length;
                if (len == 0 || !char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        /// <summary>
        /// Appends the current line to the output buffer, emitting at most
        /// one consecutive empty line and no leading empty lines.
        /// </summary>
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // Determine if line contains non-space characters. Entities are
            // still encoded at this point, so a line made only of "&nbsp;"
            // counts as empty.
            string tmp = line.Replace("&nbsp;", string.Empty);
            if (tmp.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any pending line.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
I hope this article is helpful for your C# programming.
For more articles on converting HTML to plain text in C#, please follow topic.alibabacloud.com!