/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its source text,
/// decoded with <paramref name="encoding"/>.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <returns>
/// The page source, or <see cref="string.Empty"/> when the response status is not
/// <see cref="HttpStatusCode.OK"/>, the reported content length is 1 MiB or more,
/// or any exception is raised while requesting or reading the page.
/// </returns>
public static string GetHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 20000; // milliseconds
        // Redirects are not followed, so a 3xx answer fails the OK check below.
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // NOTE(review): ContentLength is -1 when the server sends no Content-Length
            // header, so such responses pass this size guard - confirm that is intended.
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
            {
                return string.Empty;
            }

            Stream body = response.GetResponseStream();

            // Transparently inflate gzip-compressed bodies.
            if (response.ContentEncoding != null
                && response.ContentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            // Disposing the reader also disposes the (possibly wrapped) response stream.
            using (StreamReader reader = new StreamReader(body, encoding))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: every failure (bad URL, timeout, network error,
        // corrupt gzip data, null encoding) is reported to the caller as an empty string.
        return string.Empty;
    }
}
/// <summary>
/// Requests the page at <paramref name="url"/> and works out the name of its character set.
/// </summary>
/// <remarks>
/// A <c>charset=...</c> declaration found in the page text takes precedence; otherwise the
/// character set reported by the HTTP response is used. When neither is available, the
/// response is not <see cref="HttpStatusCode.OK"/>, the reported content length is 1 MiB or
/// more, or the request fails, the body name of <see cref="Encoding.Default"/> is returned.
/// </remarks>
/// <param name="url">Address of the page to request.</param>
/// <returns>A character set name; never <c>null</c> or empty.</returns>
public static string GetEncoding(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 20000; // milliseconds
        // Redirects are not followed, so a 3xx answer fails the OK check below.
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
            {
                Stream body = response.GetResponseStream();

                // Transparently inflate gzip-compressed bodies.
                if (response.ContentEncoding != null
                    && response.ContentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
                {
                    body = new GZipStream(body, CompressionMode.Decompress);
                }

                // The charset declaration itself is plain ASCII, so ASCII decoding is enough
                // to locate it whatever the real encoding of the page is.
                using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
                {
                    string html = reader.ReadToEnd();

                    // Accepts both charset=utf-8 and the quoted charset="utf-8" form, and
                    // requires at least one name character so an empty name is never returned.
                    Match declared = Regex.Match(
                        html,
                        @"charset\b\s*=\s*[""']?(?<charset>[-\w.:]+)",
                        RegexOptions.IgnoreCase);
                    if (declared.Success)
                    {
                        return declared.Groups["charset"].Value;
                    }

                    if (!string.IsNullOrEmpty(response.CharacterSet))
                    {
                        return response.CharacterSet;
                    }
                }
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any failure falls through to the default below.
    }

    // Single fallback for "nothing declared", "response rejected" and "request failed",
    // so every code path returns a usable encoding name.
    return Encoding.Default.BodyName;
}
The following complete program shows how to detect a web page's encoding and obtain its title:
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

class Program
{
    // Gets the HTML content of a web page; the encoding is determined automatically
    // from the charset declared in the page.
    static string GetHtml(string url)
    {
        return GetHtml(url, null);
    }

    // Gets the HTML content of a web page using the given encoding.
    // When encoding is null, the bytes are first decoded as UTF-8, the declared charset
    // is looked up in that text, and the bytes are decoded again if the charset differs.
    static string GetHtml(string url, Encoding encoding)
    {
        byte[] buf;
        using (WebClient client = new WebClient())
        {
            buf = client.DownloadData(url);
        }

        if (encoding != null)
        {
            return encoding.GetString(buf);
        }

        string html = Encoding.UTF8.GetString(buf);
        encoding = GetEncoding(html);

        // Compare by code page: a UTF-8 encoding object is not guaranteed to be the very
        // same instance as Encoding.UTF8, and re-decoding as UTF-8 would be wasted work.
        if (encoding == null || encoding.CodePage == Encoding.UTF8.CodePage)
        {
            return html;
        }

        return encoding.GetString(buf);
    }

    // Extracts the encoding declared in the HTML text (charset=...).
    // Returns null when no charset is declared or its name is not a known encoding.
    internal static Encoding GetEncoding(string html)
    {
        // Accepts charset=utf-8 as well as the quoted charset="utf-8" / charset='utf-8' forms.
        string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
        Match match = Regex.Match(html, pattern);
        if (!match.Success)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(match.Groups["charset"].Value);
        }
        catch (ArgumentException)
        {
            // The declared name is not an encoding this runtime supports.
            return null;
        }
    }

    // Extracts the page title from the HTML text.
    // Returns an empty string when there is no <title> element.
    internal static string GetTitle(string html)
    {
        // The optional group skips any attributes on the <title> tag, including quoted
        // attribute values that may themselves contain '>'.
        string pattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";
        return Regex.Match(html, pattern).Groups["title"].Value.Trim();
    }

    // Prints the encoding and title of the page at the given URL.
    static void PrintEncodingAndTitle(string url)
    {
        string html = GetHtml(url);
        Console.WriteLine("[{0}] [{1}]", GetEncoding(html), GetTitle(html));
    }

    // Program entry point.
    static void Main()
    {
        PrintEncodingAndTitle("http://www.msdn.net/");
        PrintEncodingAndTitle("http://www.cnblogs.com/");
        PrintEncodingAndTitle("http://www.cnblogs.com/skyiv/");
        PrintEncodingAndTitle("http://www.csdn.net/");
        PrintEncodingAndTitle("http://news.163.com/");
    }
}

/* Program output:
[] [MSDN: Microsoft Developer Network]
[System.Text.UTF8Encoding] [Blog garden - programmer's online home]
[System.Text.UTF8Encoding] [Space/IV - blog park]
[System.Text.UTF8Encoding] [CSDN.NET - the largest IT technology community in China, providing the most comprehensive information dissemination and service platform for IT professionals]
[System.Text.DBCSCodePageEncoding] [News center_NetEase news]
*/