// The code is as follows:
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Console sample that downloads a web page and decodes it to text, using either a
/// caller-supplied encoding or the charset declared inside the page itself.
/// </summary>
class Program
{
    // Matches "charset=NAME" as found in a <meta> tag or a Content-Type value.
    // An optional quote is accepted after '=' so that the HTML5 form
    // <meta charset="utf-8"> is recognised as well as "...; charset=utf-8".
    private static readonly Regex CharsetPattern = new Regex(
        @"\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Gets the HTML content of a web page, choosing the encoding automatically
    /// from the charset declared in the page.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The decoded page text.</returns>
    static string GetHtml(string url)
    {
        return GetHtml(url, null);
    }

    /// <summary>
    /// Gets the HTML content of a web page, decoding it with the given encoding.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="encoding">
    /// Encoding used to decode the downloaded bytes, or <c>null</c> to detect it
    /// from the page's charset declaration (falling back to UTF-8).
    /// </param>
    /// <returns>The decoded page text.</returns>
    static string GetHtml(string url, Encoding encoding)
    {
        byte[] buf;

        // WebClient is IDisposable; release it as soon as the bytes are in memory.
        using (var client = new WebClient())
        {
            buf = client.DownloadData(url);
        }

        if (encoding != null)
        {
            return encoding.GetString(buf);
        }

        // First pass: decode as UTF-8 only to locate the charset declaration, which
        // is plain ASCII and therefore readable for ASCII-compatible encodings.
        string html = Encoding.UTF8.GetString(buf);
        Encoding detected = GetEncoding(html);

        // Compare by code page rather than by reference: GetEncoding("utf-8") is not
        // guaranteed to return the same instance as Encoding.UTF8.
        if (detected == null || detected.CodePage == Encoding.UTF8.CodePage)
        {
            return html;
        }

        // Second pass: decode the original bytes with the declared encoding.
        return detected.GetString(buf);
    }

    /// <summary>
    /// Extracts the encoding of a web page from the charset declared in its HTML.
    /// </summary>
    /// <param name="html">HTML text to inspect.</param>
    /// <returns>
    /// The declared encoding, or <c>null</c> when no charset is declared or the
    /// declared name is not an encoding known to the runtime.
    /// </returns>
    static Encoding GetEncoding(string html)
    {
        Match match = CharsetPattern.Match(html);
        if (!match.Success)
        {
            return null;
        }

        string charset = match.Groups["charset"].Value;
        try
        {
            return Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: let the caller fall back to UTF-8.
            return null;
        }
    }

    /// <summary>
    /// Program entry point: prints the HTML of a sample page and waits for a key.
    /// </summary>
    static void Main()
    {
        Console.WriteLine(GetHtml("http://www.jb51.net"));
        Console.Read();
    }
}