Web Spider (Web Crawler) Core C# Source Code
Last Update: 2018-12-07
Source: Internet
Author: User
// Web crawlers need to be able to download information such as web pages, images (streams),
// and logon cookies. The following C# code is a useful core program.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.IO.Compression;
using System.Xml;
using System.Web;
using System.Collections;
using System.Runtime.InteropServices;
using System.Net;
using System.Net.Security;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;

namespace Common
{
    /// <summary>
    /// Callback invoked after a request has been configured and before it is sent.
    /// </summary>
    /// <param name="httpRequest">The configured request.</param>
    public delegate void OnGetPostReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Callback invoked after a response has been obtained and before its body is read.
    /// </summary>
    /// <param name="httpRequest">The request whose response is available.</param>
    public delegate void OnGetResponseReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Thin wrapper around <see cref="HttpWebRequest"/> that keeps a shared cookie container,
    /// optional proxy / credentials / client certificate, and returns response bodies either as
    /// a <see cref="MemoryStream"/> or as text.
    /// </summary>
    /// <remarks>
    /// Failures are not thrown to the caller: they are logged, <see cref="LastAccessError"/> is
    /// set to true, and an empty stream / string is returned.
    /// NOTE(review): <see cref="LastAccessError"/> is never reset to false by this class.
    /// </remarks>
    public class HttpWebHelper
    {
        protected HttpWebRequest httpRequest;
        protected HttpWebResponse httpResponse;
        protected CookieContainer cookieContainer;
        protected CredentialCache credentialCache;
        protected bool certificatedMode = false;
        protected string certFilepath = string.Empty;

        // NOTE(review): the scraped source declared only one garbled handler field, yet both
        // handlers below are invoked by the class; both are restored as public fields.
        public OnGetPostReady OnGetPostReadyHandler = null;
        public OnGetResponseReady OnGetResponseReadyHandler = null;

        protected readonly int DEFAULT_BUFFER_SIZE = 4096;
        public WebProxy webProxySrv = null;
        private static readonly int MyConnectionLimit = 300;

        public bool CheckGotoRecv { get; set; }

        /// <summary>Set to true once a response has been obtained and body download starts.</summary>
        public bool DoBetIsGotoRecv { get; set; }

        /// <summary>Set to true whenever a request or a callback throws.</summary>
        public bool LastAccessError { private set; get; }

        /// <summary>
        /// URL of the last request after automatic redirection.
        /// </summary>
        public string CurrentUrl { private set; get; }

        /// <summary>Value of the <c>Location</c> header of the last response (may be null).</summary>
        public string CurrentLocation { private set; get; }

        /// <summary>Raw <c>Set-Cookie</c> header of the last response that carried one.</summary>
        public string CurSetCookie { set; get; }

        public string CurSetCookie2 { set; get; }

        /// <summary>
        /// Default constructor. Creates an empty cookie container and adjusts the process-wide
        /// <see cref="ServicePointManager"/> settings (connection limit, Expect: 100-continue,
        /// idle time).
        /// </summary>
        public HttpWebHelper()
        {
            this.cookieContainer = new CookieContainer();
            ServicePointManager.DefaultConnectionLimit = MyConnectionLimit;
            ServicePointManager.Expect100Continue = false;
            ServicePointManager.MaxServicePointIdleTime = 10000;
        }

        /// <summary>
        /// Constructor that routes requests through a proxy.
        /// </summary>
        /// <param name="wp">Proxy to use for every request.</param>
        public HttpWebHelper(WebProxy wp)
            : this()
        {
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor that enables credential / certificate handling for https URLs.
        /// </summary>
        /// <param name="cred">True to enable certificate mode.</param>
        public HttpWebHelper(bool cred)
            : this()
        {
            this.certificatedMode = cred;
        }

        /// <summary>
        /// Constructor that enables certificate mode and routes requests through a proxy.
        /// </summary>
        /// <param name="cred">True to enable certificate mode.</param>
        /// <param name="wp">Proxy to use for every request.</param>
        public HttpWebHelper(bool cred, WebProxy wp)
            : this()
        {
            this.certificatedMode = cred;
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor that enables certificate mode and supplies a client certificate file.
        /// </summary>
        /// <param name="cred">True to enable certificate mode.</param>
        /// <param name="certFilepath">Path of the client certificate file.</param>
        public HttpWebHelper(bool cred, string certFilepath)
            : this(cred)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor that enables certificate mode, uses a proxy and a client certificate file.
        /// </summary>
        /// <param name="cred">True to enable certificate mode.</param>
        /// <param name="wp">Proxy to use for every request.</param>
        /// <param name="certFilepath">Path of the client certificate file.</param>
        public HttpWebHelper(bool cred, WebProxy wp, string certFilepath)
            : this(cred, wp)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor that enables certificate mode and registers a user name / password.
        /// </summary>
        /// <param name="uri">URI prefix the credential applies to.</param>
        /// <param name="method">Passed to <see cref="CredentialCache.Add(Uri, string, NetworkCredential)"/> as the authentication type.</param>
        /// <param name="username">User name.</param>
        /// <param name="password">Password.</param>
        public HttpWebHelper(string uri, string method, string username, string password)
            : this(true)
        {
            this.credentialCache = new CredentialCache();
            this.credentialCache.Add(new Uri(uri), method, new NetworkCredential(username, password));
        }

        /// <summary>
        /// Server certificate validation callback that accepts every certificate.
        /// </summary>
        /// <remarks>
        /// SECURITY: this disables TLS server validation. It is installed process-wide (see
        /// SetHttpRequestOptions) once a certificate-mode helper requests an https URL.
        /// Behavior is kept because callers may rely on it; review before production use.
        /// </remarks>
        /// <param name="sender">Unused.</param>
        /// <param name="certificate">Unused.</param>
        /// <param name="chain">Unused.</param>
        /// <param name="errors">Unused.</param>
        /// <returns>Always true.</returns>
        public bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true;
        }

        /// <summary>
        /// Writes a diagnostic message through the project's debug logger.
        /// </summary>
        /// <param name="message">Text to log.</param>
        private static void LogDebug(string message)
        {
            // NOTE(review): the logger method name was garbled in the scraped source
            // ("BaseDebug.debuuplint"); DebugPrint is assumed - confirm against BaseDebug.
            BaseDebug.DebugPrint(message);
        }

        /// <summary>
        /// Configures a new request and then sets its Accept header.
        /// </summary>
        private void SetHttpRequestOptions_Accept(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm, string httpAccept)
        {
            this.SetHttpRequestOptions(url, method, cc, referUrl, nocache, dm);
            this.httpRequest.Accept = httpAccept;
        }

        /// <summary>
        /// Creates and configures the <see cref="HttpWebRequest"/> for the next call.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="cc">Extra cookies to add to the shared container, or null.</param>
        /// <param name="referUrl">Referer header value, or null/empty for none.</param>
        /// <param name="nocache">True to send <c>Cache-Control: no-cache</c>.</param>
        /// <param name="dm">Automatic decompression methods.</param>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm)
        {
            httpRequest = (HttpWebRequest)WebRequest.Create(url);
            httpRequest.UnsafeAuthenticatedConnectionSharing = true;
            httpRequest.ServicePoint.ConnectionLimit = MyConnectionLimit;

            if (null != this.webProxySrv)
            {
                httpRequest.Proxy = this.webProxySrv;
            }

            // Ordinal prefix test: no culture-sensitive lower-casing and no Substring range error.
            if (this.certificatedMode && url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                ServicePointManager.ServerCertificateValidationCallback =
                    new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);

                if (null == this.credentialCache)
                {
                    httpRequest.UseDefaultCredentials = true;
                }
                else
                {
                    httpRequest.Credentials = this.credentialCache;
                }

                if (!string.IsNullOrEmpty(this.certFilepath))
                {
                    httpRequest.ClientCertificates.Add(X509Certificate.CreateFromCertFile(this.certFilepath));
                }
            }

            httpRequest.CookieContainer = this.cookieContainer;

            if (!string.IsNullOrEmpty(referUrl))
            {
                httpRequest.Referer = referUrl;
            }

            httpRequest.AutomaticDecompression = dm;
            httpRequest.ServicePoint.Expect100Continue = false;
            httpRequest.ServicePoint.UseNagleAlgorithm = false;
            httpRequest.ContentType = "application/x-www-form-urlencoded";
            httpRequest.Method = method;
            httpRequest.Timeout = ApplicationConfig.HTTP_REQUEST_TIMEOUT;
            httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
            httpRequest.Headers.Add("Accept-Language", "zh-cn");
            httpRequest.Headers.Add("UA-CPU", "x86");

            if (nocache)
            {
                httpRequest.Headers.Add("Cache-Control", "no-cache");
            }

            if (null != cc)
            {
                httpRequest.CookieContainer.Add(cc);
            }

            // Let the caller make last-minute changes to the request; a failing callback is
            // logged and flagged but does not abort the request.
            if (null != this.OnGetPostReadyHandler)
            {
                try
                {
                    this.OnGetPostReadyHandler(this.httpRequest);
                }
                catch (System.Exception ex)
                {
                    this.LastAccessError = true;
                    LogDebug(ex.ToString());
                }
            }
        }

        /// <summary>
        /// Configures a new request with caching allowed, gzip/deflate decompression and the
        /// given Accept header.
        /// </summary>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            this.SetHttpRequestOptions_Accept(url, method, cc, referUrl, false, DecompressionMethods.GZip | DecompressionMethods.Deflate, httpAccept);
        }

        /// <summary>
        /// Refreshes the cookie container, current URL and Location header from the last exchange.
        /// </summary>
        private void ManualResetMember()
        {
            this.cookieContainer = httpRequest.CookieContainer;
            this.CurrentUrl = httpRequest.Address.OriginalString;
            this.CurrentLocation = httpResponse.Headers["Location"];
        }

        /// <summary>
        /// Sends the configured request, copies the response body into <paramref name="ms"/>
        /// and rewinds it. The response is always closed, even when reading fails.
        /// </summary>
        /// <param name="ms">Destination stream for the response body.</param>
        private void ReceiveResponseInto(MemoryStream ms)
        {
            this.httpResponse = (HttpWebResponse)httpRequest.GetResponse();
            try
            {
                if (!this.httpRequest.HaveResponse)
                {
                    this.httpRequest.Abort();
                    return;
                }

                this.ManualResetMember();

                if (null != this.OnGetResponseReadyHandler)
                {
                    try
                    {
                        this.OnGetResponseReadyHandler(this.httpRequest);
                    }
                    catch (System.Exception ex)
                    {
                        this.LastAccessError = true;
                        LogDebug(ex.ToString());
                    }
                }

                this.DoBetIsGotoRecv = true;

                using (Stream responseStream = httpResponse.GetResponseStream())
                {
                    if (null != responseStream && responseStream.CanRead)
                    {
                        byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
                        int read = responseStream.Read(buffer, 0, buffer.Length);
                        while (read > 0)
                        {
                            ms.Write(buffer, 0, read);
                            read = responseStream.Read(buffer, 0, buffer.Length);
                        }
                    }
                }

                string setCookie = httpResponse.Headers["Set-Cookie"];
                if (null != setCookie)
                {
                    this.CurSetCookie = setCookie;
                }

                // Very important: rewind so the caller reads from the start of the body.
                ms.Seek(0, SeekOrigin.Begin);
            }
            finally
            {
                this.httpResponse.Close();
            }
        }

        /// <summary>
        /// Common failure path: flags the error, logs it, releases the error response (if any)
        /// and aborts the request.
        /// </summary>
        /// <param name="url">URL that was being requested.</param>
        /// <param name="ex">The exception that was caught.</param>
        private void HandleRequestFailure(string url, System.Exception ex)
        {
            this.LastAccessError = true;
            LogDebug("constant website address:" + url);
            LogDebug(ex.ToString());

            // A WebException for an HTTP error status carries a live response that must be
            // closed, otherwise its connection is not returned to the pool.
            WebException webEx = ex as WebException;
            if (null != webEx && null != webEx.Response)
            {
                webEx.Response.Close();
            }

            if (null != httpRequest)
            {
                httpRequest.Abort();
            }
        }

        /// <summary>
        /// Sends a request without a body and returns the response body.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <param name="httpAccept">Accept header value.</param>
        /// <returns>
        /// A stream positioned at the start of the body; empty when the request failed
        /// (in which case <see cref="LastAccessError"/> is true). The caller closes it.
        /// </returns>
        public MemoryStream GetMemoryStream(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            MemoryStream ms = new MemoryStream();
            try
            {
                this.SetHttpRequestOptions(url, method, cc, referUrl, "*/*");
                this.httpRequest.Accept = httpAccept;
                this.ReceiveResponseInto(ms);
            }
            catch (System.Exception ex)
            {
                this.HandleRequestFailure(url, ex);
            }
            return ms;
        }

        /// <summary>Requests <paramref name="url"/> with Accept <c>text/html</c>, no extra cookies or referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method)
        {
            return this.GetMemoryStream(url, method, null, null, "text/html");
        }

        /// <summary>Requests <paramref name="url"/> with the given Accept header, no extra cookies or referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method, string httpAccept)
        {
            return this.GetMemoryStream(url, method, null, null, httpAccept);
        }

        /// <summary>
        /// Sends a request without a body and returns the whole response as text.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding used to decode the response; null uses the StreamReader default.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>Response text; empty when the request failed.</returns>
        public string SimpleDoPostWrapper(string url, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            using (MemoryStream sm = this.GetMemoryStream(url, method, cc, referUrl, "text/html"))
            using (StreamReader sr = (null == coding) ? new StreamReader(sm) : new StreamReader(sm, coding))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>Same as the five-argument overload with default decoding, no cookies and no referer.</summary>
        public string SimpleDoPostWrapper(string url, string method)
        {
            // Call the full overload directly: passing a bare null to a three-argument overload
            // is ambiguous between the CookieCollection and string variants.
            return this.SimpleDoPostWrapper(url, method, null, null, null);
        }

        /// <summary>Same as the five-argument overload with default decoding and no referer.</summary>
        public string SimpleDoPostWrapper(string url, string method, CookieCollection cc)
        {
            return this.SimpleDoPostWrapper(url, method, null, cc, null);
        }

        /// <summary>Same as the five-argument overload with default decoding and no extra cookies.</summary>
        public string SimpleDoPostWrapper(string url, string method, string referUrl)
        {
            return this.SimpleDoPostWrapper(url, method, null, null, referUrl);
        }

        /// <summary>
        /// Sends <paramref name="data"/> as the request body and returns the response body.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="data">Request body text; null is sent as an empty body.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding used to turn <paramref name="data"/> into bytes; null means UTF-8.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>
        /// A stream positioned at the start of the body; empty when the request failed
        /// (in which case <see cref="LastAccessError"/> is true). The caller closes it.
        /// </returns>
        public MemoryStream GetMemoryStream(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            MemoryStream ms = new MemoryStream();
            try
            {
                this.SetHttpRequestOptions(url, method, cc, referUrl, "text/html");

                Encoding bodyEncoding = coding ?? Encoding.UTF8;
                byte[] bytesData = bodyEncoding.GetBytes(data ?? string.Empty);

                using (Stream requestStream = httpRequest.GetRequestStream())
                {
                    requestStream.Write(bytesData, 0, bytesData.Length);
                    requestStream.Flush();
                }

                this.ReceiveResponseInto(ms);
            }
            catch (System.Exception ex)
            {
                this.HandleRequestFailure(url, ex);
            }
            return ms;
        }

        /// <summary>Sends <paramref name="data"/> with no extra cookies or referer and returns the response body.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding)
        {
            return this.GetMemoryStream(url, data, method, coding, null, null);
        }

        /// <summary>Sends <paramref name="data"/> with a referer and no extra cookies and returns the response body.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.GetMemoryStream(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Sends <paramref name="data"/> as the request body and returns the whole response as text.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the response is decoded with the StreamReader default encoding;
        /// <paramref name="coding"/> is only applied to the request body.
        /// </remarks>
        /// <param name="url">Target URL.</param>
        /// <param name="data">Request body text.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding of the request body.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>Response text; empty when the request failed.</returns>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            using (MemoryStream sm = this.GetMemoryStream(url, data, method, coding, cc, referUrl))
            using (StreamReader sr = new StreamReader(sm))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>Same as the six-argument overload with no extra cookies and no referer.</summary>
        public string DoPostWrapper(string url, string data, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, data, method, coding, null, null);
        }

        /// <summary>Same as the six-argument overload with no referer.</summary>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, data, method, coding, cc, null);
        }

        /// <summary>Same as the six-argument overload with no extra cookies.</summary>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Sends the dictionary as a URL-encoded form body and returns the whole response as text.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the form is URL-encoded with the default encoding, not with
        /// <paramref name="coding"/>; confirm whether that is intended.
        /// </remarks>
        /// <param name="url">Target URL.</param>
        /// <param name="dicArguments">Form fields.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding of the request body bytes.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>Response text; empty when the request failed.</returns>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            string data = this.BuildRequestArguments(dicArguments);
            return this.DoPostWrapper(url, data, method, coding, cc, referUrl);
        }

        /// <summary>Same as the six-argument dictionary overload with no extra cookies and no referer.</summary>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, null);
        }

        /// <summary>Same as the six-argument dictionary overload with no referer.</summary>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, cc, null);
        }

        /// <summary>Same as the six-argument dictionary overload with no extra cookies.</summary>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, referUrl);
        }

        /// <summary>
        /// Downloads any content type (e.g. a verification-code image) as a memory stream.
        /// The caller is responsible for closing the returned stream.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <returns>Stream positioned at the start of the body; empty on failure.</returns>
        public MemoryStream DownloadStream(string url, string method)
        {
            return this.SimpleGetMemoryStream(url, method, "*/*");
        }

        /// <summary>
        /// Builds a URL-encoded <c>key=value&amp;key=value</c> string from the dictionary.
        /// </summary>
        /// <param name="dicArguments">Form fields; null or empty yields an empty string.</param>
        /// <param name="coding">Encoding used for percent-encoding; null uses the HttpUtility default.</param>
        /// <returns>The encoded argument string, without a trailing separator.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments, Encoding coding)
        {
            if (null == dicArguments || 0 == dicArguments.Count)
            {
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder();
            foreach (KeyValuePair<string, string> kvp in dicArguments)
            {
                if (sb.Length > 0)
                {
                    sb.Append("&");
                }

                if (null != coding)
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key, coding));
                    sb.Append("=");
                    sb.Append(HttpUtility.UrlEncode(kvp.Value, coding));
                }
                else
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key));
                    sb.Append("=");
                    sb.Append(HttpUtility.UrlEncode(kvp.Value));
                }
            }
            return sb.ToString();
        }

        /// <summary>
        /// Builds a URL-encoded argument string from the dictionary using the default encoding.
        /// </summary>
        /// <param name="dicArguments">Form fields.</param>
        /// <returns>The encoded argument string.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments)
        {
            return this.BuildRequestArguments(dicArguments, null);
        }

        /// <summary>
        /// Looks up the value of a cookie held in the shared container.
        /// </summary>
        /// <param name="key">Cookie name.</param>
        /// <param name="domain">Absolute URI whose cookies are searched.</param>
        /// <returns>The cookie value, or an empty string when there is no such cookie.</returns>
        public string GetCookieValue(string key, string domain)
        {
            if (null == this.cookieContainer || 0 == this.cookieContainer.Count)
            {
                return string.Empty;
            }

            CookieCollection cc = this.cookieContainer.GetCookies(new Uri(domain));
            Cookie cookie = cc[key];

            // The indexer yields null for an unknown name; report that as "no value".
            return (null == cookie) ? string.Empty : cookie.Value;
        }

        /// <summary>
        /// Replaces the shared cookie container.
        /// </summary>
        /// <param name="cc">Container used for subsequent requests.</param>
        public void SetCookieContainer(CookieContainer cc)
        {
            this.cookieContainer = cc;
        }

        /// <summary>
        /// Aborts the current request, if any.
        /// </summary>
        /// <returns>True when both <see cref="CheckGotoRecv"/> and <see cref="DoBetIsGotoRecv"/> are set.</returns>
        public bool AbortHttpRequest()
        {
            if (null != this.httpRequest)
            {
                this.httpRequest.Abort();
            }
            return this.CheckGotoRecv && this.DoBetIsGotoRecv;
        }
    }
}