一個小需求,擷取遠程頁面的源碼,主要用於抓資料。原來用的好好的,最近突然不能擷取頁面源碼了,但是仍然可以用瀏覽器正常瀏覽。(文後附源碼下載。^_^)
經過分析,原來用的代碼如下:
// Original version of the download logic (body of a method that receives `Url`
// and `encoding`). Per the surrounding article this is the variant that stopped
// returning page source: it sends no User-Agent / Accept / Accept-Language
// headers. Those headers are intentionally NOT added here.
string result = string.Empty;
try
{
    HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(Url);
    // httpWebRequest.Timeout = 20;
    httpWebRequest.KeepAlive = false;

    // `using` guarantees the response (and its connection) is released on every
    // path, including a non-OK status or an exception while reading.
    using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
    {
        if (httpWebResponse.StatusCode == HttpStatusCode.OK)
        {
            using (StreamReader sreader = new StreamReader(httpWebResponse.GetResponseStream(), encoding))
            {
                result = sreader.ReadToEnd();
            }
        }
    }

    // Empty string when the status was anything other than 200 OK.
    return result;
}
catch (WebException)
{
    // Any transport/protocol failure is reported to the caller as null.
    return null;
}
查了下資料,原來需要加參數。
#region 關鍵參數,否則會取不到內容 Important Parameters,else get nothing.
// HttpWebRequest.UserAgent takes only the header VALUE. Including the
// "User-Agent: " prefix in the value would send "User-Agent: User-Agent: Mozilla/...".
httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
httpWebRequest.Accept = "*/*";
httpWebRequest.KeepAlive = true;
httpWebRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
#endregion
修正後的代碼如下:
#region Read page content
/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its body as text.
/// </summary>
/// <param name="Url">
/// Address to read. "http://" is prepended when the value does not already
/// start with "http://" or "https://".
/// </param>
/// <param name="encoding">Encoding used to decode the response stream.</param>
/// <returns>
/// The page text; an empty string when the response status is not 200 OK;
/// <c>null</c> when <paramref name="Url"/> is null, empty or "about:blank",
/// or when every attempt failed with a <see cref="WebException"/>.
/// </returns>
public static string GetStringByUrl(string Url, System.Text.Encoding encoding)
{
    if (string.IsNullOrEmpty(Url) || Url.Equals("about:blank")) { return null; }
    if (!Url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !Url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        Url = "http://" + Url;
    }

    // Every failed attempt counts towards the limit. The previous goto-based
    // retry only counted ConnectFailure, so any other WebException (timeout,
    // 404, ...) retried forever.
    const int maxAttempts = 5;
    for (int attempt = 0; attempt < maxAttempts; attempt++)
    {
        try
        {
            HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(Url);
            // httpWebRequest.Timeout = 20;

            // Key headers: per the article, the target site returns nothing without them.
            // UserAgent takes only the header value, so no "User-Agent: " prefix.
            httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
            httpWebRequest.Accept = "*/*";
            httpWebRequest.KeepAlive = true;
            httpWebRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

            using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
            {
                if (httpWebResponse.StatusCode != HttpStatusCode.OK)
                {
                    return string.Empty;
                }

                using (StreamReader sreader = new StreamReader(httpWebResponse.GetResponseStream(), encoding))
                {
                    // ReadToEnd replaces the 256-char read loop that appended to a
                    // string on every iteration (quadratic copying on large pages).
                    return sreader.ReadToEnd();
                }
            }
        }
        catch (WebException e)
        {
            // Only a connection failure triggers a re-dial before the next attempt.
            if (e.Status == WebExceptionStatus.ConnectFailure) { ReDial(); }
        }
    }

    return null;
}
#endregion

/// <summary>
/// Hook invoked after a connection failure to re-establish the link.
/// Currently a no-op: the RAS-based implementation is disabled.
/// </summary>
public static void ReDial()
{
    // Disabled implementation, kept for reference:
    //   loop until ras.Connect("asdl") returns 0:
    //     CSDNWebTest.RASDisplay ras = new RASDisplay();
    //     ras.Disconnect();
    //     res = ras.Connect("asdl");
    //     System.Threading.Thread.Sleep(TimeSpan.FromSeconds(10));
}
問題是解決了,後來再想了想,也可以用 WebClient 先把頁面 download 到本地臨時檔案,再讀取檔案的文字內容。
代碼如下:
// Shared generator: creating a new Random seeded from the clock on every call
// yields identical values for calls made within the same tick.
private static readonly Random s_random = new Random();

/// <summary>
/// Downloads the page at <paramref name="url"/> to a temporary file with
/// WebClient and returns the file's text, decoded with the system default encoding.
/// </summary>
/// <param name="url">
/// Address to read. "http://" is prepended when the value does not already
/// start with "http://" or "https://".
/// </param>
/// <returns>
/// The page text, or <c>null</c> when <paramref name="url"/> is null, empty or
/// "about:blank", or when the temporary file cannot be read.
/// </returns>
private string GetPageByWebClient(string url)
{
    if (string.IsNullOrEmpty(url) || url.Equals("about:blank")) { return null; }
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    // Use the user's temp directory instead of the root of C:, which is
    // typically not writable for a normal or web-application account.
    string tempFile = Path.Combine(Path.GetTempPath(), RandomKey(1111, 9999) + ".txt");
    try
    {
        // Download the page URL itself. DownloadOneFileByURLWithWebClient appends
        // the file name to the URL, which made this method request
        // "<url><random>.txt" instead of the page.
        using (System.Net.WebClient wc = new System.Net.WebClient())
        {
            wc.DownloadFile(url, tempFile);
        }

        using (StreamReader sr = new StreamReader(tempFile, System.Text.Encoding.Default))
        {
            return sr.ReadToEnd();
        }
    }
    catch (IOException)
    {
        return null;
    }
    finally
    {
        // Do not leave one temp file behind per request.
        if (File.Exists(tempFile)) { File.Delete(tempFile); }
    }
}

/// <summary>
/// Builds a key of the form "yyyyMMdd-HHmmss-fff-N" from the current local
/// time and a random integer N.
/// </summary>
/// <param name="b">Inclusive lower bound of the random part.</param>
/// <param name="e">Exclusive upper bound of the random part.</param>
private string RandomKey(int b, int e)
{
    return DateTime.Now.ToString("yyyyMMdd-HHmmss-fff-") + this.getRandomID(b, e);
}

/// <summary>
/// Returns a random integer in [<paramref name="minValue"/>, <paramref name="maxValue"/>).
/// </summary>
private int getRandomID(int minValue, int maxValue)
{
    // Random is not thread-safe; serialize access to the shared instance.
    lock (s_random)
    {
        return s_random.Next(minValue, maxValue);
    }
}

/// <summary>Gets a newly generated GUID formatted as a string.</summary>
private string GuidString
{
    get { return Guid.NewGuid().ToString(); }
}

/// <summary>
/// WebClient-based download, intended only for small files such as pictures.
/// Downloads the resource at <paramref name="url"/> + <paramref name="fileName"/>
/// into <paramref name="localPath"/>, replacing any existing file of that name.
/// </summary>
/// <param name="fileName">File name; appended to <paramref name="url"/> and used as the local name.</param>
/// <param name="url">Base address; concatenated with <paramref name="fileName"/> as-is (no separator is inserted).</param>
/// <param name="localPath">Target directory; created when it does not exist.</param>
public static void DownloadOneFileByURLWithWebClient(string fileName, string url, string localPath)
{
    // Path.Combine tolerates a localPath with or without a trailing separator.
    string target = Path.Combine(localPath, fileName);

    if (!Directory.Exists(localPath)) { Directory.CreateDirectory(localPath); }
    if (File.Exists(target)) { File.Delete(target); }

    using (System.Net.WebClient wc = new System.Net.WebClient())
    {
        wc.DownloadFile(url + fileName, target);
    }
}
結果不能擷取源碼。錯誤如下:
再想想,還有 WebBrowser 控制項可以用啊。在 WinForm 下只要在程式進入點(Main 方法)前加 [STAThread] 即可。
/// <summary>
/// Loads the address typed into <c>txtUrl</c> in a WebBrowser control and shows
/// the resulting document text in <c>lbResult</c>.
/// </summary>
/// <remarks>
/// Does nothing when the address is empty, "about:blank", not a valid URI, or
/// when the page does not finish loading within the time limit.
/// [STAThread] is not applied here: it only takes effect on the application's
/// entry point (Main), which is where it must be declared.
/// </remarks>
public void GetURLContentByWebBrowser()
{
    string url = txtUrl.Text.Trim();
    if (String.IsNullOrEmpty(url) || url.Equals("about:blank")) { return; }
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    const int timeoutSeconds = 30;

    using (WebBrowser wb = new WebBrowser())
    {
        try
        {
            wb.Navigate(new Uri(url));
        }
        catch (System.UriFormatException)
        {
            // Deliberately ignored: an invalid address simply produces no result.
            return;
        }

        // Navigate is asynchronous; reading DocumentText right away returns the
        // empty initial document. Pump messages until loading has completed.
        DateTime deadline = DateTime.UtcNow.AddSeconds(timeoutSeconds);
        while (wb.ReadyState != WebBrowserReadyState.Complete)
        {
            if (DateTime.UtcNow > deadline) { return; }
            System.Windows.Forms.Application.DoEvents();
        }

        lbResult.Text = wb.DocumentText;
    }
}
在WebForm就麻煩些了,出現錯誤,線程不在單一執行緒 Apartment中,故無法執行個體化 ActiveX 控制項“8856f961-340a-11d0-a96b-00c04fd705a2”
代碼如下:
/// <summary>
/// Loads <paramref name="url"/> in a WebBrowser control and returns the
/// document stream decoded with the encoding reported by the document.
/// </summary>
/// <param name="url">
/// Address to read. "http://" is prepended when the value does not already
/// start with "http://" or "https://".
/// </param>
/// <returns>
/// The page HTML, or <c>null</c> when <paramref name="url"/> is null, empty or
/// "about:blank", when loading does not complete within the time limit, or
/// when the document cannot be read.
/// </returns>
/// <remarks>
/// WebBrowser is an ActiveX wrapper and must be created on an STA thread; per
/// the article, an ASP.NET page needs AspCompat="true" for this to work.
/// </remarks>
private string GetPageStringbyWebBrowser(string url)
{
    if (string.IsNullOrEmpty(url) || url.Equals("about:blank")) { return null; }
    if (!url.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
        && !url.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
    {
        url = "http://" + url;
    }

    const int timeoutSeconds = 30;

    // `using` disposes the control on every path, including when Navigate throws.
    using (WebBrowser myWB = new WebBrowser())
    {
        myWB.ScrollBarsEnabled = false;
        myWB.Navigate(url);

        // Pump messages until the document is loaded, but give up after the
        // time limit instead of spinning forever on a page that never completes.
        System.DateTime deadline = System.DateTime.UtcNow.AddSeconds(timeoutSeconds);
        while (myWB.ReadyState != WebBrowserReadyState.Complete)
        {
            if (System.DateTime.UtcNow > deadline) { return null; }
            System.Windows.Forms.Application.DoEvents();
            System.Threading.Thread.Sleep(10); // avoid pegging a CPU core while waiting
        }

        try
        {
            using (System.IO.StreamReader getReader = new System.IO.StreamReader(
                myWB.DocumentStream,
                System.Text.Encoding.GetEncoding(myWB.Document.Encoding)))
            {
                return getReader.ReadToEnd();
            }
        }
        catch (System.Exception)
        {
            // Best effort, as before: any failure to read the document yields null.
            return null;
        }
    }
}
後來搜尋N小時(N>=5)後,終於找到可行解決方案,在WebPage頁面頭部加入AspCompat="true"
即 <%@ Page Language="C#" AspCompat="true" …(其餘屬性照舊)… %>
MSDN給出的解釋是:
在 ASP .NET 網頁的 <%@Page> 標記中包含相容性屬性 aspcompat=true,如 <%@Page aspcompat=true Language=VB%>。使用此屬性將強制網頁以 STA 模式執行,從而確保您的組件可以繼續正確運行。如果試圖使用 STA 組件但沒有指定此標記,運行時將會發生異常情況。
將此屬性的值設定為 true 時,將允許網頁調用 COM+ 1.0 組件,該組件需要訪問未管理 ASP 內建對象。可以通過 ObjectContext 對象進行訪問。
如果將此標記的值設為 true,效能會稍微有些下降。建議只在確實需要時才這樣做。
終於可以了! 不知道有沒有更好的方法??
附:源碼下載。
邀月註:
如果不能測試,請注意是否在域(AD)環境下,如果是! 請注意設定代理和防火牆
請參考:
http://dev.csdn.net/article/83914.shtm
或http://blog.csdn.net/downmoon/archive/2006/04/14/663337.aspx
或http://www.cnblogs.com/downmoon/archive/2007/12/29/1019701.html