A C# implementation of a method for converting HTML into plain text

Source: Internet
Author: User
Tags return tag
This example shows a C# implementation of a method for converting HTML to plain text. It is shared here for your reference. The details are as follows:

How to use:

HtmlToText convert = new HtmlToText(); TextBox2.Text = convert.Convert(TextBox1.Text);

The C# code is as follows:

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// <see cref="Convert"/> keeps its parsing state (input, position, output buffer)
/// in instance fields, so each call overwrites the state of the previous one.
/// </remarks>
class HtmlToText
{
    // Static data tables

    // Text written to the output when a tag is encountered. Keys are lower-case
    // tag names as returned by ParseTag; closing tags keep their leading '/'.
    protected static Dictionary<string, string> _tags = new Dictionary<string, string>
    {
        { "address", "\n" },
        { "blockquote", "\n" },
        { "div", "\n" },
        { "dl", "\n" },
        { "fieldset", "\n" },
        { "form", "\n" },
        { "h1", "\n" },
        { "/h1", "\n" },
        { "h2", "\n" },
        { "/h2", "\n" },
        { "h3", "\n" },
        { "/h3", "\n" },
        { "h4", "\n" },
        { "/h4", "\n" },
        { "h5", "\n" },
        { "/h5", "\n" },
        { "h6", "\n" },
        { "/h6", "\n" },
        { "p", "\n" },
        { "/p", "\n" },
        { "table", "\n" },
        { "/table", "\n" },
        { "ul", "\n" },
        { "/ul", "\n" },
        { "ol", "\n" },
        { "/ol", "\n" },
        { "/li", "\n" },
        { "br", "\n" },
        { "/td", "\t" },
        { "/tr", "\n" },
        { "/pre", "\n" }
    };

    // Elements whose whole content is dropped from the output.
    protected static HashSet<string> _ignoreTags = new HashSet<string>
    {
        "script",
        "noscript",
        "style",
        "object"
    };

    // Ignored elements whose content is raw text, not markup: a '<' or a quote
    // inside them must not be interpreted as the start of a tag or attribute.
    private static readonly HashSet<string> _rawTextTags = new HashSet<string>
    {
        "script",
        "style"
    };

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; null is treated as empty input.</param>
    /// <returns>Resulting plain text, with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded only here, after the text has been assembled,
        // so the line buffer still sees "&nbsp;" etc. in their encoded form.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns
    /// information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true if a '/' appears after the tag name
    /// (outside quoted attribute values).</param>
    /// <returns>
    /// The lower-case tag name (closing tags include the leading '/'), "!--" for
    /// an HTML comment, or an empty string if the current character is not '&lt;'.
    /// </returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            // A comment ends at "-->", not at the first '>'; skip it as a unit so
            // its text does not leak into the output.
            if (String.CompareOrdinal(_html, _pos, "<!--", 0, 4) == 0)
            {
                // Searching from the "--" lets the degenerate "<!-->" terminate too.
                int end = _html.IndexOf("-->", _pos + 2, StringComparison.Ordinal);
                _pos = (end < 0) ? _html.Length : end + 3;
                return "!--";
            }

            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !IsTagNameEnd(Peek()))
            {
                MoveAhead();
            }
            // Invariant lower-casing: culture-sensitive ToLower() would turn "DIV"
            // into a dotless-i form under Turkish cultures and miss the lookup tables.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the current tag, up to and including its
    /// matching end tag (or to the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the element whose content is skipped.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (_rawTextTags.Contains(tag))
        {
            // Raw text element: nothing inside is markup, so look directly for
            // "</tag" followed by a character that ends a tag name.
            string marker = "<" + endTag;
            int search = _pos;
            while (true)
            {
                int index = _html.IndexOf(marker, search, StringComparison.OrdinalIgnoreCase);
                if (index < 0)
                {
                    _pos = _html.Length;
                    return;
                }

                int after = index + marker.Length;
                if (after >= _html.Length || IsTagNameEnd(_html[after]))
                {
                    // Let ParseTag consume the end tag itself
                    _pos = index;
                    bool ignored;
                    ParseTag(out ignored);
                    return;
                }

                // e.g. "</scripts": not our end tag, keep looking
                search = after;
            }
        }

        // Markup element: track nesting of same-named elements only, so that
        // unrelated or unclosed inner tags cannot unbalance the search.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                bool selfClosing;
                string name = ParseTag(out selfClosing);
                if (name == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (name == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Returns true if c terminates a tag name
    private static bool IsTagNameEnd(char c)
    {
        return Char.IsWhiteSpace(c) || c == '/' || c == '>';
    }

    // Returns true if the current position is at the end of the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ((char)0 when at the end of the string)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace character
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    // Moves the current position to the next non-whitespace character or the
    // start of the next line, whichever comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    // Moves the current position past a quoted value. The value ends at the
    // matching quote or at a line break (which is consumed as well), so an
    // unterminated quote cannot swallow more than the current line.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead();    // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded. While this
        /// property is true, characters are passed through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character (collapse runs of whitespace)
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to the output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // Determine if line contains non-space characters. Entities are still
            // encoded at this point, so a line made only of "&nbsp;" counts as empty.
            string tmp = line.Replace("&nbsp;", String.Empty);
            if (tmp.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at the top
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}

I hope this article is helpful for your C# programming.

For more C# methods for converting HTML to plain text, please follow topic.alibabacloud.com!

  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If the content of the page confuses you, please write us an email, and we will handle the problem within 5 days of receiving it.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.