網路蜘蛛(網路爬蟲)核心C#原始碼

來源:互聯網
上載者:User
// A web spider (crawler) needs to download web pages and images (streams) and to keep
// login cookies; the C# code below is a practical core helper for doing that.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.IO.Compression;
using System.Xml;
using System.Web;
using System.Collections;
using System.Runtime.InteropServices;
using System.Net;
using System.Net.Security;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;

namespace Common
{
    /// <summary>
    /// Callback invoked once a request has been configured, before it is sent.
    /// </summary>
    /// <param name="httpRequest">The configured request.</param>
    public delegate void OnGetPostReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Callback intended for the moment a response has been received.
    /// </summary>
    /// <param name="httpRequest">The request the response belongs to.</param>
    public delegate void OnGetResponseReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Thin wrapper around <see cref="HttpWebRequest"/> that keeps a cookie container between
    /// calls and returns response bodies as <see cref="MemoryStream"/> or text.
    /// Failures are logged through <c>BaseDebug.DebugPrint</c> and reported via
    /// <see cref="LastAccessError"/> instead of being thrown to the caller.
    /// </summary>
    public class HttpWebHelper
    {
        protected HttpWebRequest httpRequest;
        protected HttpWebResponse httpResponse;
        protected CookieContainer cookieContainer;
        protected CredentialCache credentialCache;
        protected bool certificatedMode = false;
        protected string certFilepath = string.Empty;

        /// <summary>Invoked after the request has been configured and before it is sent.</summary>
        public OnGetPostReady OnGetPostReadyHandler = null;

        /// <summary>Invoked after a response has been obtained and before its body is read.</summary>
        // NOTE(review): declared as OnGetPostReady rather than OnGetResponseReady. The two
        // delegates have the same signature; the field type is kept so existing callers compile.
        public OnGetPostReady OnGetResponseReadyHandler = null;

        protected readonly int DEFAULT_BUFFER_SIZE = 4096;
        public WebProxy webProxySrv = null;
        private static readonly int MyConnectionLimit = 300;

        public bool CheckGotoRecv { get; set; }

        /// <summary>Set to true once a response has been obtained and body reading is about to start.</summary>
        public bool DoBetIsGotoRecv { get; set; }

        /// <summary>
        /// True when the most recent request (or one of its callbacks) failed.
        /// Cleared at the start of every request.
        /// </summary>
        public bool LastAccessError { private set; get; }

        /// <summary>
        /// URL of the current request after automatic redirection.
        /// </summary>
        public string CurrentUrl { private set; get; }

        /// <summary>Value of the <c>Location</c> header of the last response, if any.</summary>
        public string CurrentLocation { private set; get; }

        /// <summary>Value of the <c>Set-Cookie</c> header of the last response that carried one.</summary>
        public string CurSetCookie { set; get; }

        public string CurSetCookie2 { set; get; }

        /// <summary>
        /// Default constructor. Creates an empty cookie container and adjusts the static
        /// <see cref="ServicePointManager"/> settings (connection limit, Expect100Continue,
        /// idle time).
        /// </summary>
        public HttpWebHelper()
        {
            this.cookieContainer = new CookieContainer();
            ServicePointManager.DefaultConnectionLimit = MyConnectionLimit;
            ServicePointManager.Expect100Continue = false;
            ServicePointManager.MaxServicePointIdleTime = 10000;
        }

        /// <summary>
        /// Constructor taking a proxy.
        /// </summary>
        /// <param name="wp">Proxy assigned to every request; may be null.</param>
        public HttpWebHelper(WebProxy wp)
            : this()
        {
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor that switches credential/certificate handling on or off.
        /// </summary>
        /// <param name="cred">When true, https requests get credentials and relaxed certificate validation.</param>
        public HttpWebHelper(bool cred)
            : this()
        {
            this.certificatedMode = cred;
        }

        /// <summary>
        /// Constructor taking the credential/certificate switch and a proxy.
        /// </summary>
        /// <param name="cred">See <see cref="HttpWebHelper(bool)"/>.</param>
        /// <param name="wp">Proxy assigned to every request; may be null.</param>
        public HttpWebHelper(bool cred, WebProxy wp)
            : this()
        {
            this.certificatedMode = cred;
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor taking the credential/certificate switch and a client certificate file.
        /// </summary>
        /// <param name="cred">See <see cref="HttpWebHelper(bool)"/>.</param>
        /// <param name="certFilepath">Path of a certificate file added to https requests.</param>
        public HttpWebHelper(bool cred, string certFilepath)
            : this(cred)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor taking the credential/certificate switch, a proxy and a client certificate file.
        /// </summary>
        /// <param name="cred">See <see cref="HttpWebHelper(bool)"/>.</param>
        /// <param name="wp">Proxy assigned to every request; may be null.</param>
        /// <param name="certFilepath">Path of a certificate file added to https requests.</param>
        public HttpWebHelper(bool cred, WebProxy wp, string certFilepath)
            : this(cred, wp)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor that registers a user name and password in a credential cache.
        /// </summary>
        /// <param name="uri">URI prefix the credential applies to.</param>
        /// <param name="method">Authentication type passed to <see cref="CredentialCache.Add(Uri, string, NetworkCredential)"/>.</param>
        /// <param name="username">User name.</param>
        /// <param name="password">Password.</param>
        public HttpWebHelper(string uri, string method, string username, string password)
            : this(true)
        {
            this.credentialCache = new CredentialCache();
            this.credentialCache.Add(new Uri(uri), method, new NetworkCredential(username, password));
        }

        /// <summary>
        /// Certificate validation callback that accepts every server certificate.
        /// </summary>
        /// <param name="sender">Unused.</param>
        /// <param name="certificate">Unused.</param>
        /// <param name="chain">Unused.</param>
        /// <param name="errors">Unused.</param>
        /// <returns>Always true.</returns>
        // SECURITY: accepting every certificate removes protection against man-in-the-middle
        // attacks. Kept because it is the documented intent of this helper; review before
        // using against anything sensitive.
        public bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true;
        }

        /// <summary>
        /// Configures <see cref="httpRequest"/> and then sets its Accept header.
        /// </summary>
        private void SetHttpRequestOptions_Accept(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm, string httpAccept)
        {
            this.SetHttpRequestOptions(url, method, cc, referUrl, nocache, dm);
            this.httpRequest.Accept = httpAccept;
        }

        /// <summary>
        /// Creates and configures the <see cref="HttpWebRequest"/> for one call.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="cc">Extra cookies merged into the container; may be null.</param>
        /// <param name="referUrl">Referer header value; skipped when null or empty.</param>
        /// <param name="nocache">When true, adds <c>Cache-Control: no-cache</c>.</param>
        /// <param name="dm">Automatic decompression methods.</param>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm)
        {
            httpRequest = (HttpWebRequest)HttpWebRequest.Create(url);
            httpRequest.UnsafeAuthenticatedConnectionSharing = true;
            httpRequest.ServicePoint.ConnectionLimit = MyConnectionLimit;

            if (null != this.webProxySrv)
            {
                httpRequest.Proxy = this.webProxySrv;
            }

            // Ordinal, case-insensitive prefix test: no culture dependence and no exception
            // on strings shorter than five characters.
            if (this.certificatedMode && url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                // SECURITY: this property is static, so the accept-all callback applies to
                // requests made outside this helper as well.
                ServicePointManager.ServerCertificateValidationCallback = new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);

                if (null == this.credentialCache)
                {
                    httpRequest.UseDefaultCredentials = true;
                }
                else
                {
                    httpRequest.Credentials = this.credentialCache;
                }

                if (!string.IsNullOrEmpty(this.certFilepath))
                {
                    httpRequest.ClientCertificates.Add(X509Certificate.CreateFromCertFile(this.certFilepath));
                }
            }

            httpRequest.CookieContainer = this.cookieContainer;

            if (!string.IsNullOrEmpty(referUrl))
            {
                httpRequest.Referer = referUrl;
            }

            httpRequest.AutomaticDecompression = dm;
            httpRequest.ServicePoint.Expect100Continue = false;
            httpRequest.ServicePoint.UseNagleAlgorithm = false;
            httpRequest.ContentType = "application/x-www-form-urlencoded";
            // AllowWriteStreamBuffering, AllowAutoRedirect and MaximumAutomaticRedirections are
            // deliberately left at their defaults; ReadWriteTimeout is not set either.
            httpRequest.Method = method;
            httpRequest.Timeout = ApplicationConfig.HTTP_REQUEST_TIMEOUT;
            httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
            httpRequest.Headers.Add("Accept-Language", "zh-cn");
            httpRequest.Headers.Add("UA-CPU", "x86");

            if (nocache)
            {
                httpRequest.Headers.Add("Cache-Control", "no-cache");
            }

            if (null != cc)
            {
                httpRequest.CookieContainer.Add(cc);
            }

            // Let the caller adjust the request before it is sent.
            this.InvokeRequestHandler(this.OnGetPostReadyHandler);
        }

        /// <summary>
        /// Configures the request with caching allowed, GZip/Deflate decompression and the given Accept header.
        /// </summary>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            this.SetHttpRequestOptions_Accept(url, method, cc, referUrl, false, DecompressionMethods.GZip | DecompressionMethods.Deflate, httpAccept);
        }

        /// <summary>
        /// Runs a caller-supplied callback with the current request. An exception thrown by the
        /// callback is logged and recorded in <see cref="LastAccessError"/>, not propagated.
        /// </summary>
        /// <param name="handler">Callback to run; ignored when null.</param>
        private void InvokeRequestHandler(OnGetPostReady handler)
        {
            if (null == handler)
            {
                return;
            }

            try
            {
                handler(this.httpRequest);
            }
            catch (System.Exception ex)
            {
                this.LastAccessError = true;
                BaseDebug.DebugPrint(ex.ToString());
            }
        }

        /// <summary>
        /// Refreshes the members derived from the last request/response pair.
        /// </summary>
        private void ManualResetMember()
        {
            this.cookieContainer = httpRequest.CookieContainer;
            this.CurrentUrl = httpRequest.Address.OriginalString;
            this.CurrentLocation = httpResponse.Headers["Location"];
        }

        /// <summary>
        /// Sends the already configured request, copies the response body into
        /// <paramref name="ms"/> and rewinds it. The response and its stream are closed even
        /// when reading fails. Exceptions propagate to the caller.
        /// </summary>
        /// <param name="ms">Destination stream for the response body.</param>
        private void ReadResponseInto(MemoryStream ms)
        {
            this.httpResponse = (HttpWebResponse)httpRequest.GetResponse();
            try
            {
                // No response received: give up and leave the destination empty.
                if (!this.httpRequest.HaveResponse)
                {
                    this.httpRequest.Abort();
                    return;
                }

                this.ManualResetMember();
                this.InvokeRequestHandler(this.OnGetResponseReadyHandler);
                this.DoBetIsGotoRecv = true;

                Stream body = httpResponse.GetResponseStream();
                try
                {
                    if (null != body && body.CanRead)
                    {
                        byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
                        int read;
                        while ((read = body.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            ms.Write(buffer, 0, read);
                        }
                    }
                }
                finally
                {
                    if (null != body)
                    {
                        body.Close();
                    }
                }

                string setCookie = httpResponse.Headers["Set-Cookie"];
                if (null != setCookie)
                {
                    this.CurSetCookie = setCookie;
                }

                // Important: rewind so the caller reads from the beginning.
                ms.Seek(0, SeekOrigin.Begin);
            }
            finally
            {
                this.httpResponse.Close();
            }
        }

        /// <summary>
        /// Logs a failed request, records it in <see cref="LastAccessError"/> and aborts the request.
        /// </summary>
        private void HandleRequestFailure(string url, System.Exception ex)
        {
            this.LastAccessError = true;
            BaseDebug.DebugPrint("異常網址:" + url);
            BaseDebug.DebugPrint(ex.ToString());
            if (null != httpRequest)
            {
                httpRequest.Abort();
            }
        }

        /// <summary>
        /// Sends a request without a body and returns the response body.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer; may be null.</param>
        /// <param name="httpAccept">Accept header value.</param>
        /// <returns>
        /// A stream the caller must close. It is positioned at the start on success; on failure
        /// nothing is thrown and <see cref="LastAccessError"/> is set.
        /// </returns>
        public MemoryStream GetMemoryStream(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            MemoryStream ms = new MemoryStream();
            this.LastAccessError = false;
            try
            {
                this.SetHttpRequestOptions(url, method, cc, referUrl, httpAccept);
                this.ReadResponseInto(ms);
            }
            catch (System.Exception ex)
            {
                this.HandleRequestFailure(url, ex);
            }
            return ms;
        }

        /// <summary>Requests <paramref name="url"/> with Accept <c>text/html</c>, no extra cookies and no referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method)
        {
            return this.GetMemoryStream(url, method, null, null, "text/html");
        }

        /// <summary>Requests <paramref name="url"/> with the given Accept header, no extra cookies and no referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method, string httpAccept)
        {
            return this.GetMemoryStream(url, method, null, null, httpAccept);
        }

        /// <summary>
        /// Sends a request without a body and returns the whole response as text.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding used to decode the response; null uses the <see cref="StreamReader"/> default.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer; may be null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string SimpleDoPostWrapper(string url, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            using (MemoryStream body = this.GetMemoryStream(url, method, cc, referUrl, "text/html"))
            using (StreamReader reader = (null == coding) ? new StreamReader(body) : new StreamReader(body, coding))
            {
                return reader.ReadToEnd();
            }
        }

        public string SimpleDoPostWrapper(string url, string method)
        {
            return this.SimpleDoPostWrapper(url, method, null, null, null);
        }

        public string SimpleDoPostWrapper(string url, string method, CookieCollection cc)
        {
            return this.SimpleDoPostWrapper(url, method, null, cc, null);
        }

        public string SimpleDoPostWrapper(string url, string method, string referUrl)
        {
            return this.SimpleDoPostWrapper(url, method, null, null, referUrl);
        }

        /// <summary>
        /// Uploads <paramref name="data"/> as the request body and returns the response body.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="data">Request body text; null is sent as an empty body.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding of the request body; null falls back to UTF-8.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer; may be null.</param>
        /// <returns>
        /// A stream the caller must close. It is positioned at the start on success; on failure
        /// nothing is thrown and <see cref="LastAccessError"/> is set.
        /// </returns>
        public MemoryStream GetMemoryStream(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            MemoryStream ms = new MemoryStream();
            this.LastAccessError = false;
            try
            {
                this.SetHttpRequestOptions(url, method, cc, referUrl, "text/html");

                byte[] bytesData = (coding ?? Encoding.UTF8).GetBytes(data ?? string.Empty);
                using (Stream requestStream = httpRequest.GetRequestStream())
                {
                    requestStream.Write(bytesData, 0, bytesData.Length);
                    requestStream.Flush();
                }

                this.ReadResponseInto(ms);
            }
            catch (System.Exception ex)
            {
                this.HandleRequestFailure(url, ex);
            }
            return ms;
        }

        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding)
        {
            return this.GetMemoryStream(url, data, method, coding, null, null);
        }

        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.GetMemoryStream(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Uploads <paramref name="data"/> and returns the whole response as text.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="data">Request body text.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding of the request body.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer; may be null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            // NOTE(review): the response is decoded with the StreamReader default, not with
            // `coding` (unlike SimpleDoPostWrapper). Left unchanged because callers may rely on it.
            using (MemoryStream body = this.GetMemoryStream(url, data, method, coding, cc, referUrl))
            using (StreamReader reader = new StreamReader(body))
            {
                return reader.ReadToEnd();
            }
        }

        public string DoPostWrapper(string url, string data, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, data, method, coding, null, null);
        }

        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, data, method, coding, cc, null);
        }

        public string DoPostWrapper(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Uploads the dictionary as URL-encoded form arguments and returns the whole response as text.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="dicArguments">Form arguments.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding of the request body.</param>
        /// <param name="cc">Extra cookies; may be null.</param>
        /// <param name="referUrl">Referer; may be null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            string data = this.BuildRequestArguments(dicArguments);
            return this.DoPostWrapper(url, data, method, coding, cc, referUrl);
        }

        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, null);
        }

        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, cc, null);
        }

        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, referUrl);
        }

        /// <summary>
        /// Downloads a resource (for example a captcha image) as a memory stream.
        /// The caller is responsible for closing the returned stream.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <returns>The response body.</returns>
        public MemoryStream DownloadStream(string url, string method)
        {
            return this.SimpleGetMemoryStream(url, method, "*/*");
        }

        /// <summary>
        /// Builds a <c>key=value&amp;key=value</c> argument string from a dictionary,
        /// URL-encoding keys and values with the given encoding.
        /// </summary>
        /// <param name="dicArguments">Arguments; null or empty yields an empty string.</param>
        /// <param name="coding">Encoding used for URL-encoding; null uses the <see cref="HttpUtility"/> default.</param>
        /// <returns>The encoded argument string without a trailing separator.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments, Encoding coding)
        {
            if (null == dicArguments || 0 == dicArguments.Count)
            {
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder();
            foreach (KeyValuePair<string, string> kvp in dicArguments)
            {
                // Every pair contributes at least "=", so a non-empty builder means a pair precedes this one.
                if (sb.Length > 0)
                {
                    sb.Append("&");
                }

                if (null != coding)
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key, coding)).Append("=").Append(HttpUtility.UrlEncode(kvp.Value, coding));
                }
                else
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key)).Append("=").Append(HttpUtility.UrlEncode(kvp.Value));
                }
            }
            return sb.ToString();
        }

        /// <summary>
        /// Builds the argument string with the default URL encoding.
        /// </summary>
        /// <param name="dicArguments">Arguments; null or empty yields an empty string.</param>
        /// <returns>The encoded argument string.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments)
        {
            return this.BuildRequestArguments(dicArguments, null);
        }

        /// <summary>
        /// Looks up the value of one cookie.
        /// </summary>
        /// <param name="key">Cookie name.</param>
        /// <param name="domain">Absolute URI whose cookies are searched.</param>
        /// <returns>The cookie value, or an empty string when there is no such cookie.</returns>
        public string GetCookieValue(string key, string domain)
        {
            if (null == this.cookieContainer || 0 == this.cookieContainer.Count)
            {
                return string.Empty;
            }

            CookieCollection cc = this.cookieContainer.GetCookies(new Uri(domain));
            // The indexer yields null for an unknown name; do not dereference it blindly.
            Cookie cookie = cc[key];
            return (null == cookie) ? string.Empty : cookie.Value;
        }

        /// <summary>
        /// Replaces the cookie container used for subsequent requests.
        /// </summary>
        /// <param name="cc">The new container.</param>
        public void SetCookieContainer(CookieContainer cc)
        {
            this.cookieContainer = cc;
        }

        /// <summary>
        /// Aborts the current request, if there is one.
        /// </summary>
        /// <returns>True when both <see cref="CheckGotoRecv"/> and <see cref="DoBetIsGotoRecv"/> are set.</returns>
        public bool AbortHttpRequest()
        {
            if (null != this.httpRequest)
            {
                this.httpRequest.Abort();
            }
            return this.CheckGotoRecv && this.DoBetIsGotoRecv;
        }
    }
}
相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.