C#: Converting HTML to plain text

Source: Internet
Author: User
Tags return tag
// Required namespaces: System, System.Collections.Generic, System.Text,
// and System.Web (for HttpUtility.HtmlDecode).

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a small hand-written scanner: it walks the input one
/// character at a time, replaces selected tags with line breaks or tabs,
/// skips the content of elements that never contain readable text, collapses
/// whitespace outside of &lt;pre&gt; blocks, and decodes HTML entities at the
/// very end. Parsing state lives in instance fields that are reset on every
/// call to <see cref="Convert"/>.
/// </remarks>
class HtmlToText
{
    // Static data tables

    /// <summary>Maps a lower-case tag name (closing tags include the leading '/') to the text written in its place.</summary>
    protected static Dictionary<string, string> _tags;

    /// <summary>Lower-case names of elements whose entire inner content is discarded.</summary>
    protected static HashSet<string> _ignoreTags;

    // Instance variables

    /// <summary>Output buffer for the conversion in progress.</summary>
    protected TextBuilder _text;

    /// <summary>The HTML being converted.</summary>
    protected string _html;

    /// <summary>Current read position within <see cref="_html"/>.</summary>
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as an empty string.</param>
    /// <returns>Resulting plain text, with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? string.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                switch (tag)
                {
                    case "body":
                        // Discard content before <body>
                        _text.Clear();
                        break;
                    case "/body":
                        // Discard content after </body>
                        _pos = _html.Length;
                        break;
                    case "pre":
                        // Enter preformatted mode
                        _text.Preformatted = true;
                        EatWhitespaceToNextLine();
                        break;
                    case "/pre":
                        // Exit preformatted mode
                        _text.Preformatted = false;
                        break;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded only now, so everything above sees them in
        // their raw "&name;" form.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns
    /// information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true when a '/' appears outside of a
    /// quoted value after the tag name.</param>
    /// <returns>The lower-cased tag name (with a leading '/' for closing
    /// tags), or an empty string if the current character is not '&lt;'.</returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = string.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }

            // Invariant casing: the lookup tables are keyed on ASCII names,
            // and culture-sensitive lower-casing (e.g. Turkish 'I') would
            // make upper-case tags miss them.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }

            // Step over the closing '>'
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes inner content from the current tag, up to and including its
    /// matching closing tag (or the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the element whose content is skipped.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // Only nested elements of the SAME name affect where the content
        // ends. Counting them (rather than recursing on every inner tag)
        // keeps void elements such as <img> from swallowing the rest of
        // the document.
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() != '<')
            {
                MoveAhead();
                continue;
            }

            // Consume a tag
            bool selfClosing;
            string inner = ParseTag(out selfClosing);
            if (inner == endTag)
            {
                depth--;
                if (depth == 0)
                {
                    return;
                }
            }
            else if (inner == tag && !selfClosing)
            {
                depth++;
            }
        }
    }

    /// <summary>
    /// Returns true if the current position is at the end of the string.
    /// </summary>
    protected bool EndOfText
    {
        get { return _pos >= _html.Length; }
    }

    /// <summary>
    /// Safely returns the character at the current position, or the NUL
    /// character when the position is at the end of the string.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        while (char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or
    /// the start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. Does nothing if the
    /// current character is not a single or double quote.
    /// </summary>
    protected void EatQuotedValue()
    {
        char quote = Peek();
        if (quote == '"' || quote == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value. A line break also ends the value so that an
            // unterminated quote cannot consume the rest of the document.
            _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead(); // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder-based class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        /// <summary>
        /// Creates an empty builder in normal (non-preformatted) mode.
        /// </summary>
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded. If this
        /// property is set to true, they are passed through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write.</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write.</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (char.IsWhiteSpace(c))
            {
                // Write single space character
                int len = _currLine.Length;
                if (len == 0 || !char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        /// <summary>
        /// Appends the current line to the output buffer, dropping leading
        /// blank lines and collapsing runs of blank lines into one.
        /// </summary>
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // Determine if line contains non-space characters. Entities are
            // still undecoded at this point, so a non-breaking space shows
            // up as the literal text "&nbsp;".
            string tmp = line.Replace("&nbsp;", string.Empty);
            if (tmp.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any pending line first.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
    
   
  
 
  • Related Article

    Contact Us

    The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

    If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.