C# 將 HTML 轉成純文字

來源:互聯網
上載者:User
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// Parsing state is kept in instance fields (<c>_text</c>, <c>_html</c>,
/// <c>_pos</c>), so one instance must not run several conversions concurrently.
/// </remarks>
class HtmlToText
{
    // Static data tables
    // Maps a lower-cased tag name (closing tags carry a leading '/') to the
    // text that is written to the output when that tag is encountered.
    protected static Dictionary<string, string> _tags;
    // Elements whose inner content is dropped entirely.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as an empty string.</param>
    /// <returns>Resulting plain text, with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            char current = Peek();

            if (current == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                // A self-closed element has no inner content to skip; looking
                // for its end tag would discard unrelated text that follows.
                if (!selfClosing && _ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(current))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? current : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(current);
                MoveAhead();
            }
        }

        // Return result
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns
    /// information about that tag.
    /// </summary>
    /// <param name="selfClosing">
    /// Set to true when the last non-whitespace character before the closing
    /// '&gt;' is an unquoted '/'.
    /// </param>
    /// <returns>
    /// The lower-cased tag name (closing tags include the leading '/'),
    /// "!--" for an HTML comment, or an empty string when the current
    /// position is not at '&lt;'.
    /// </returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() != '<')
        {
            return tag;
        }

        MoveAhead();

        // HTML comment: skip to the terminating "-->" so that markup or '>'
        // characters inside the comment never reach the output.
        if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
        {
            // Searching from _pos + 1 also accepts the degenerate forms
            // "<!-->" and "<!--->" as empty comments.
            int close = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
            _pos = (close < 0) ? _html.Length : close + 3;
            return "!--";
        }

        // Parse tag name
        EatWhitespace();
        int start = _pos;
        if (Peek() == '/')
        {
            MoveAhead();
        }
        while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
            Peek() != '/' && Peek() != '>')
        {
            MoveAhead();
        }
        // Invariant lower-casing keeps table lookups culture independent
        // (e.g. "DIV" must not become "dıv" under a Turkish culture).
        tag = _html.Substring(start, _pos - start).ToLowerInvariant();

        // Parse rest of tag
        while (!EndOfText && Peek() != '>')
        {
            char c = Peek();
            if (c == '"' || c == '\'')
            {
                EatQuotedValue();
                selfClosing = false;
            }
            else
            {
                if (c == '/')
                {
                    selfClosing = true;
                }
                else if (!Char.IsWhiteSpace(c))
                {
                    // A '/' inside an unquoted attribute value (such as
                    // type=text/css) does not make the tag self-closing.
                    selfClosing = false;
                }
                MoveAhead();
            }
        }
        MoveAhead();    // Closing '>'
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the element whose opening tag has just
    /// been parsed, up to and including its matching end tag (or to the end
    /// of the input when there is none).
    /// </summary>
    /// <param name="tag">Lower-cased name of the element, without a leading '/'.</param>
    protected void EatInnerContent(string tag)
    {
        if (IsRawTextTag(tag))
        {
            SkipRawText(tag);
            return;
        }

        string endTag = "/" + tag;

        // Number of currently open elements named `tag`, including the one
        // whose opening tag the caller already consumed.
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() != '<')
            {
                MoveAhead();
                continue;
            }

            bool selfClosing;
            string inner = ParseTag(out selfClosing);

            if (inner == endTag)
            {
                depth--;
                if (depth == 0)
                {
                    return;
                }
            }
            else if (selfClosing)
            {
                // Nothing to skip for a self-closed nested element
            }
            else if (inner == tag)
            {
                depth++;
            }
            else if (IsRawTextTag(inner))
            {
                // Nested script/style text may contain '<' characters that
                // must not be interpreted as tags.
                SkipRawText(inner);
            }
        }
    }

    // Returns true for elements whose content is raw text rather than markup
    private static bool IsRawTextTag(string tag)
    {
        return tag == "script" || tag == "style";
    }

    // Skips raw text up to and including the end tag for the given element,
    // without interpreting '<' characters inside the text as tags
    private void SkipRawText(string tag)
    {
        string endTag = "/" + tag;
        string marker = "<" + endTag;

        while (!EndOfText)
        {
            int index = _html.IndexOf(marker, _pos, StringComparison.OrdinalIgnoreCase);
            if (index < 0)
            {
                // Unterminated element: discard the remainder of the input
                _pos = _html.Length;
                return;
            }

            _pos = index;
            bool ignored;
            // ParseTag confirms the full name so that, for example,
            // "</scripts>" is not mistaken for "</script>". It always moves
            // past the '<', so the loop makes progress on every iteration.
            if (ParseTag(out ignored) == endTag)
            {
                return;
            }
        }
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ((char)0 when the position is at or past the end)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    // Moves the current position past a quoted value. The value ends at the
    // matching quote character or at a line break, whichever comes first.
    protected void EatQuotedValue()
    {
        char quote = Peek();
        if (quote == '"' || quote == '\'')
        {
            // Opening quote
            MoveAhead();
            // Find end of value
            _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead();    // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // After Trim() the line is empty exactly when it held nothing
            // but whitespace.
            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at the
                // very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
  • 相關文章

    聯繫我們

    該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

    如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.