/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// This is a lightweight, forgiving scanner rather than a full HTML parser.
/// Block-level tags are mapped to line breaks, table cells to tabs, the
/// content of non-visible elements is dropped, and HTML entities are decoded
/// at the very end. An instance keeps per-conversion state in its fields, so
/// a single instance must not be used for concurrent conversions.
/// </remarks>
class HtmlToText
{
    // Static data tables.
    // Keys are lower-case because ParseTag lower-cases every tag name it returns.
    protected static Dictionary<string, string> _tags;
    protected static HashSet<string> _ignoreTags;

    // Instance variables (state of the conversion currently in progress)
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        // Text emitted when the given opening ("name") or closing ("/name") tag is seen
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        // Elements whose entire inner content is discarded
        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as an empty string.</param>
    /// <returns>Resulting plain text, with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last so that an encoded "&lt;" is never mistaken for a tag
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns
    /// information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true if a '/' was seen after the tag name.</param>
    /// <returns>
    /// The lower-cased tag name, prefixed with '/' for a closing tag, or an
    /// empty string if the current position is not at a '&lt;'.
    /// </returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }

            // Invariant casing: tag names are identifiers, not linguistic text
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    // Skip quoted attribute values so a '>' inside them does not end the tag
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }

            // Step over the closing '>'
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the current tag, up to and including
    /// its matching end tag (or the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-cased name of the element whose content is discarded.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // script and style hold raw text, not markup: a '<' inside them is not a
        // tag, so jump directly to the literal end tag instead of parsing.
        if (tag == "script" || tag == "style")
        {
            int end = _html.IndexOf("<" + endTag, _pos, StringComparison.OrdinalIgnoreCase);
            if (end < 0)
            {
                _pos = _html.Length;
                return;
            }

            _pos = end;
            bool ignored;
            ParseTag(out ignored);
            return;
        }

        // Other elements may nest inside themselves; only same-named tags change
        // the depth, so unrelated inner markup cannot unbalance the count.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string name = ParseTag(out selfClosing);

                if (name == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (name == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    /// <summary>
    /// Returns true if the current position is at the end of the string.
    /// </summary>
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    /// <summary>
    /// Safely returns the character at the current position, or '\0' when
    /// the position is at or past the end of the input.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character, never
    /// moving past the end of the input.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or
    /// the start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. The value ends at the
    /// matching quote or at a line break, whichever comes first.
    /// </summary>
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value; stopping at a line break limits the damage
            // done by an unterminated quote
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead();    // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded. If this
        /// property is set to true, they are passed through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write.</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write.</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Collapse a run of whitespace into a single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                    {
                        _currLine.Append(' ');
                    }
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to the output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // Determine if line contains non-space characters. Entities are not
            // decoded yet at this point, so a line holding only "&nbsp;" counts
            // as empty.
            string tmp = line.Replace("&nbsp;", String.Empty);
            if (tmp.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at the
                // very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}