asp.net 利用HttpWebRequest自動擷取網頁編碼並擷取網頁原始碼

來源:互聯網
上載者:User
     /// <summary>
     /// Downloads the page at <paramref name="url"/> and decodes its body with the supplied encoding.
     /// </summary>
     /// <param name="url">
     /// Address of the page to fetch. The request is cast to <see cref="HttpWebRequest"/>, so any
     /// other scheme fails and yields an empty result.
     /// </param>
     /// <param name="encoding">Encoding used to decode the response body.</param>
     /// <returns>
     /// The decoded page text, or <see cref="string.Empty"/> when the status is not 200 OK, the
     /// declared Content-Length is 1 MiB or more, or any error occurs while fetching or decoding.
     /// </returns>
     public static string GetHtml(string url, Encoding encoding)
     {
         try
         {
             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
             request.Timeout = 20000;            // milliseconds
             request.AllowAutoRedirect = false;  // a redirect is reported as non-OK and gives an empty result

             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
             {
                 // ContentLength is -1 when the server sends no Content-Length header, so such
                 // responses pass the size check; only a declared length of 1 MiB or more is rejected.
                 if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
                 {
                     return string.Empty;
                 }

                 Stream body = response.GetResponseStream();

                 // Content-Encoding is a protocol token, so compare ordinally rather than by culture.
                 if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                 {
                     body = new GZipStream(body, CompressionMode.Decompress);
                 }

                 // The reader owns the stream chain and is disposed before the response is closed.
                 using (StreamReader reader = new StreamReader(body, encoding))
                 {
                     return reader.ReadToEnd();
                 }
             }
         }
         catch (Exception ex)
         {
             // Deliberately best-effort: callers get an empty string on any failure, but the
             // cause is traced instead of being silently discarded.
             System.Diagnostics.Trace.TraceWarning("GetHtml failed for '{0}': {1}", url, ex.Message);
         }

         return string.Empty;
     }
    /// <summary>
    /// Downloads the page at <paramref name="url"/> and determines the name of its character set.
    /// </summary>
    /// <param name="url">
    /// Address of the page to inspect. The request is cast to <see cref="HttpWebRequest"/>, so any
    /// other scheme fails and yields an empty result.
    /// </param>
    /// <returns>
    /// The first <c>charset=...</c> value found in the page text; otherwise the character set from
    /// the response's Content-Type header; otherwise the body name of <see cref="Encoding.Default"/>.
    /// Returns <see cref="string.Empty"/> when the status is not 200 OK, the declared
    /// Content-Length is 1 MiB or more, or any error occurs.
    /// </returns>
    public static string GetEncoding(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 20000;            // milliseconds
            request.AllowAutoRedirect = false;  // a redirect is reported as non-OK and gives an empty result

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                // ContentLength is -1 when the server sends no Content-Length header, so such
                // responses pass the size check; only a declared length of 1 MiB or more is rejected.
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
                {
                    return string.Empty;
                }

                // Read the header value up front, before the body stream is consumed and disposed.
                string headerCharset = response.CharacterSet;

                Stream body = response.GetResponseStream();
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    body = new GZipStream(body, CompressionMode.Decompress);
                }

                // The charset declaration itself is plain ASCII, so ASCII decoding is enough
                // to locate it whatever the page's real encoding is.
                string html;
                using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
                {
                    html = reader.ReadToEnd();
                }

                // Accepts both the quoted and unquoted forms (charset=utf-8, charset="utf-8",
                // charset='utf-8') and captures only characters that are valid in a charset name.
                Match match = Regex.Match(
                    html,
                    @"charset\s*=\s*[""']?\s*(?<charset>[-A-Za-z0-9_.:]+)",
                    RegexOptions.IgnoreCase);
                if (match.Success)
                {
                    return match.Groups["charset"].Value;
                }

                if (!string.IsNullOrEmpty(headerCharset))
                {
                    return headerCharset;
                }

                return Encoding.Default.BodyName;
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: callers get an empty string on any failure, but the
            // cause is traced instead of being silently discarded.
            System.Diagnostics.Trace.TraceWarning("GetEncoding failed for '{0}': {1}", url, ex.Message);
        }

        return string.Empty;
    }

下面是擷取網頁標題的範例程式:

 using System;
 using System.Net;
 using System.Text;
 using System.Text.RegularExpressions;

 /// <summary>
 /// Console sample that downloads web pages, detects their encoding from the charset
 /// declaration in the HTML, and prints each page's encoding and title.
 /// </summary>
 class Program
 {
     /// <summary>
     /// Downloads the page at <paramref name="url"/>, choosing the encoding automatically
     /// from the charset declared in the page.
     /// </summary>
     /// <param name="url">Address of the page to download.</param>
     /// <returns>The decoded HTML text.</returns>
     static string GetHtml(string url)
     {
         return GetHtml(url, null);
     }

     /// <summary>
     /// Downloads the page at <paramref name="url"/> and decodes it.
     /// </summary>
     /// <param name="url">Address of the page to download.</param>
     /// <param name="encoding">
     /// Encoding to decode with, or <c>null</c> to decode as UTF-8 first and re-decode with the
     /// charset declared in the page when that charset is recognized and is not UTF-8.
     /// </param>
     /// <returns>The decoded HTML text.</returns>
     static string GetHtml(string url, Encoding encoding)
     {
         byte[] buf;
         using (WebClient client = new WebClient())
         {
             buf = client.DownloadData(url);
         }

         if (encoding != null)
         {
             return encoding.GetString(buf);
         }

         // First pass as UTF-8: the charset declaration is ASCII, so it survives this decode
         // even when the page is really in another encoding.
         string html = Encoding.UTF8.GetString(buf);
         Encoding detected = GetEncoding(html);

         // Compare code pages rather than references so any UTF-8 instance counts as UTF-8.
         if (detected == null || detected.CodePage == Encoding.UTF8.CodePage)
         {
             return html;
         }

         return detected.GetString(buf);
     }

     /// <summary>
     /// Extracts the encoding declared by the first <c>charset=...</c> in the HTML text.
     /// </summary>
     /// <param name="html">HTML text to search.</param>
     /// <returns>
     /// The matching <see cref="Encoding"/>, or <c>null</c> when no charset is declared or the
     /// declared name is not a supported encoding.
     /// </returns>
     internal static Encoding GetEncoding(string html)
     {
         // Accepts charset=utf-8 as well as the quoted forms charset="utf-8" / charset='utf-8'.
         string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
         Match match = Regex.Match(html, pattern);
         if (!match.Success)
         {
             return null;
         }

         try
         {
             return Encoding.GetEncoding(match.Groups["charset"].Value);
         }
         catch (ArgumentException)
         {
             // The page declares a charset name this runtime does not support.
             return null;
         }
     }

     /// <summary>
     /// Extracts the text of the first <c>&lt;title&gt;</c> element in the HTML text.
     /// </summary>
     /// <param name="html">HTML text to search.</param>
     /// <returns>The trimmed title text, or an empty string when there is no title element.</returns>
     internal static string GetTitle(string html)
     {
         // (?s) lets '.' span newlines, (?i) ignores tag case; the optional group skips any
         // attributes on the opening tag, including quoted values that contain '>'.
         string pattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";
         return Regex.Match(html, pattern).Groups["title"].Value.Trim();
     }

     /// <summary>
     /// Downloads the page at <paramref name="url"/> and prints its detected encoding and title.
     /// </summary>
     /// <param name="url">Address of the page to inspect.</param>
     static void PrintEncodingAndTitle(string url)
     {
         string html = GetHtml(url);
         Console.WriteLine("[{0}] [{1}]", GetEncoding(html), GetTitle(html));
     }

     /// <summary>
     /// Program entry point: prints encoding and title for a fixed list of sites.
     /// </summary>
     static void Main()
     {
         PrintEncodingAndTitle("http://www.msdn.net/");
         PrintEncodingAndTitle("http://www.cnblogs.com/");
         PrintEncodingAndTitle("http://www.cnblogs.com/skyiv/");
         PrintEncodingAndTitle("http://www.csdn.net/");
         PrintEncodingAndTitle("http://news.163.com/");
     }
 }

 /* Program output recorded by the original author (page titles are quoted verbatim):
  [] [MSDN: Microsoft Developer Network]
  [System.Text.UTF8Encoding] [部落格園 - 程式員的網上家園]
  [System.Text.UTF8Encoding] [空間/IV - 部落格園]
  [System.Text.UTF8Encoding] [CSDN.NET - 中國最大的IT技術社區,為IT專業技術人員提供最全面的資訊傳播和服務平台]
  [System.Text.DBCSCodePageEncoding] [新聞中心_網易新聞]
  */

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.