Web Spider (Web Crawler) Core C# Source Code

Source: Internet
Author: User
// Web crawlers (spiders) need to be able to download information such as web pages and
// images (as streams), and to carry logon cookies between requests. The following C# code
// is a reusable core helper for that purpose.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.IO.Compression;
using System.Xml;
using System.Web;
using System.Collections;
using System.Runtime.InteropServices;
using System.Net;
using System.Net.Security;
using System.Security.Authentication;
using System.Security.Cryptography.X509Certificates;

namespace Common
{
    /// <summary>
    /// Callback invoked once a request has been fully configured, just before it is sent.
    /// </summary>
    /// <param name="httpRequest">The configured request.</param>
    public delegate void OnGetPostReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Callback invoked once a response has been obtained, before its body is read.
    /// </summary>
    /// <param name="httpRequest">The request whose response is ready.</param>
    public delegate void OnGetResponseReady(HttpWebRequest httpRequest);

    /// <summary>
    /// Thin wrapper around <see cref="HttpWebRequest"/> that keeps a cookie container across
    /// requests and returns response bodies either as a <see cref="MemoryStream"/> or as text.
    /// Failures are not thrown to the caller: they are logged through <c>BaseDebug</c>,
    /// <see cref="LastAccessError"/> is set, and an empty (or partial) stream is returned.
    /// </summary>
    /// <remarks>
    /// Depends on <c>ApplicationConfig.HTTP_REQUEST_TIMEOUT</c> and <c>BaseDebug.DebugPrint</c>,
    /// which are defined elsewhere in the project.
    /// NOTE(review): the published listing was garbled; the logging call appeared as
    /// "BaseDebug.debuuplint" and the handler fields were partly missing. The names used here
    /// are reconstructed from usage - confirm against the real project.
    /// </remarks>
    public class HttpWebHelper
    {
        protected HttpWebRequest httpRequest;
        protected HttpWebResponse httpResponse;
        protected CookieContainer cookieContainer;
        protected CredentialCache credentialCache;
        protected bool certificatedMode = false;
        protected string certFilepath = string.Empty;

        /// <summary>Optional hook called after the request is configured and before it is sent.</summary>
        public OnGetPostReady OnGetPostReadyHandler = null;

        /// <summary>Optional hook called after the response arrives and before the body is read.</summary>
        public OnGetResponseReady OnGetResponseReadyHandler = null;

        /// <summary>Chunk size, in bytes, used when copying the response body.</summary>
        protected readonly int DEFAULT_BUFFER_SIZE = 4096;

        /// <summary>Proxy applied to every request when not null.</summary>
        public WebProxy webProxySrv = null;

        private static readonly int MyConnectionLimit = 300;

        public bool CheckGotoRecv { get; set; }

        /// <summary>Set to true once a response has been obtained and body reading is about to start.</summary>
        public bool DoBetIsGotoRecv { get; set; }

        /// <summary>
        /// Set to true when a request or a handler callback throws. It is never reset by this class.
        /// </summary>
        public bool LastAccessError { private set; get; }

        /// <summary>
        /// URL after automatic redirection (taken from the request's Address once a response exists).
        /// </summary>
        public string CurrentUrl { private set; get; }

        /// <summary>Value of the "Location" header of the last response, or null if absent.</summary>
        public string CurrentLocation { private set; get; }

        /// <summary>Value of the "Set-Cookie" header of the last response that carried one.</summary>
        public string CurSetCookie { set; get; }

        public string CurSetCookie2 { set; get; }

        /// <summary>
        /// Default constructor. Creates an empty cookie container and adjusts the process-wide
        /// <see cref="ServicePointManager"/> settings (connection limit, Expect: 100-continue,
        /// idle time).
        /// </summary>
        public HttpWebHelper()
        {
            this.cookieContainer = new CookieContainer();
            ServicePointManager.DefaultConnectionLimit = MyConnectionLimit;
            ServicePointManager.Expect100Continue = false;
            ServicePointManager.MaxServicePointIdleTime = 10000;
        }

        /// <summary>
        /// Constructor that routes requests through a proxy.
        /// </summary>
        /// <param name="wp">Proxy to use for every request.</param>
        public HttpWebHelper(WebProxy wp)
            : this()
        {
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor that enables authenticated (credential / certificate) mode for HTTPS URLs.
        /// </summary>
        /// <param name="cred">True to enable authenticated mode.</param>
        public HttpWebHelper(bool cred)
            : this()
        {
            this.certificatedMode = cred;
        }

        /// <summary>
        /// Constructor that sets authenticated mode and a proxy.
        /// </summary>
        /// <param name="cred">True to enable authenticated mode.</param>
        /// <param name="wp">Proxy to use for every request.</param>
        public HttpWebHelper(bool cred, WebProxy wp)
            : this()
        {
            this.certificatedMode = cred;
            this.webProxySrv = wp;
        }

        /// <summary>
        /// Constructor that sets authenticated mode and a client certificate file.
        /// </summary>
        /// <param name="cred">True to enable authenticated mode.</param>
        /// <param name="certFilepath">Path of the certificate file added to HTTPS requests.</param>
        public HttpWebHelper(bool cred, string certFilepath)
            : this(cred)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor that sets authenticated mode, a proxy and a client certificate file.
        /// </summary>
        /// <param name="cred">True to enable authenticated mode.</param>
        /// <param name="wp">Proxy to use for every request.</param>
        /// <param name="certFilepath">Path of the certificate file added to HTTPS requests.</param>
        public HttpWebHelper(bool cred, WebProxy wp, string certFilepath)
            : this(cred, wp)
        {
            this.certFilepath = certFilepath;
        }

        /// <summary>
        /// Constructor that enables authenticated mode with an explicit user name and password.
        /// </summary>
        /// <param name="uri">Absolute URI prefix the credential applies to.</param>
        /// <param name="method">Passed to <see cref="CredentialCache.Add(Uri, string, NetworkCredential)"/> as its second (authentication type) argument.</param>
        /// <param name="username">User name.</param>
        /// <param name="password">Password.</param>
        public HttpWebHelper(string uri, string method, string username, string password)
            : this(true)
        {
            this.credentialCache = new CredentialCache();
            this.credentialCache.Add(new Uri(uri), method, new NetworkCredential(username, password));
        }

        /// <summary>
        /// Server-certificate validation callback that accepts every certificate.
        /// </summary>
        /// <remarks>
        /// SECURITY: returning true unconditionally disables TLS server verification. The
        /// callback is installed on the process-wide <see cref="ServicePointManager"/>, so it
        /// affects every HTTPS request in the process once installed. Kept for backward
        /// compatibility; review before using against untrusted networks.
        /// </remarks>
        /// <returns>Always true.</returns>
        public bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return true;
        }

        /// <summary>
        /// Configures a new request and then sets its Accept header.
        /// </summary>
        private void SetHttpRequestOptions_Accept(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm, string httpAccept)
        {
            this.SetHttpRequestOptions(url, method, cc, referUrl, nocache, dm);
            this.httpRequest.Accept = httpAccept;
        }

        /// <summary>
        /// Creates and configures the <see cref="HttpWebRequest"/> stored in <c>httpRequest</c>.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="cc">Extra cookies to add to the container, or null.</param>
        /// <param name="referUrl">Referer header value, or null/empty for none.</param>
        /// <param name="nocache">True to send "Cache-Control: no-cache".</param>
        /// <param name="dm">Automatic decompression methods.</param>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, bool nocache, DecompressionMethods dm)
        {
            httpRequest = (HttpWebRequest)HttpWebRequest.Create(url);
            httpRequest.UnsafeAuthenticatedConnectionSharing = true;
            httpRequest.ServicePoint.ConnectionLimit = MyConnectionLimit;

            if (null != this.webProxySrv)
            {
                httpRequest.Proxy = this.webProxySrv;
            }

            // Ordinal, case-insensitive prefix test: no culture-dependent lowering and no
            // out-of-range Substring on short strings.
            if (this.certificatedMode && url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                ServicePointManager.ServerCertificateValidationCallback =
                    new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);

                if (null == this.credentialCache)
                {
                    httpRequest.UseDefaultCredentials = true;
                }
                else
                {
                    httpRequest.Credentials = this.credentialCache;
                }

                if (!string.IsNullOrEmpty(this.certFilepath))
                {
                    httpRequest.ClientCertificates.Add(X509Certificate.CreateFromCertFile(this.certFilepath));
                }
            }

            httpRequest.CookieContainer = this.cookieContainer;

            if (!string.IsNullOrEmpty(referUrl))
            {
                httpRequest.Referer = referUrl;
            }

            httpRequest.AutomaticDecompression = dm;
            httpRequest.ServicePoint.Expect100Continue = false;
            httpRequest.ServicePoint.UseNagleAlgorithm = false;
            httpRequest.ContentType = "application/x-www-form-urlencoded";
            // AllowWriteStreamBuffering, AllowAutoRedirect (true) and
            // MaximumAutomaticRedirections (50) are left at their defaults.
            httpRequest.Method = method;
            httpRequest.Timeout = ApplicationConfig.HTTP_REQUEST_TIMEOUT;
            httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
            httpRequest.Headers.Add("Accept-Language", "zh-cn");
            httpRequest.Headers.Add("UA-CPU", "x86");

            if (nocache)
            {
                httpRequest.Headers.Add("Cache-Control", "no-cache");
            }

            if (null != cc)
            {
                httpRequest.CookieContainer.Add(cc);
            }

            // Let the caller customize the request; a failing callback is logged, not propagated.
            if (null != this.OnGetPostReadyHandler)
            {
                try
                {
                    this.OnGetPostReadyHandler(this.httpRequest);
                }
                catch (System.Exception ex)
                {
                    this.LastAccessError = true;
                    BaseDebug.DebugPrint(ex.ToString());
                }
            }
        }

        /// <summary>
        /// Configures a new request with caching allowed, gzip/deflate decompression and the
        /// given Accept header.
        /// </summary>
        private void SetHttpRequestOptions(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            this.SetHttpRequestOptions_Accept(url, method, cc, referUrl, false, DecompressionMethods.GZip | DecompressionMethods.Deflate, httpAccept);
        }

        /// <summary>
        /// Refreshes the members that depend on the request/response pair just completed.
        /// </summary>
        private void ManualResetMember()
        {
            this.cookieContainer = httpRequest.CookieContainer;
            this.CurrentUrl = httpRequest.Address.OriginalString;
            this.CurrentLocation = httpResponse.Headers["Location"];
        }

        /// <summary>
        /// Sends the already-configured <c>httpRequest</c>, copies the response body into
        /// <paramref name="ms"/> and rewinds it. The response is always closed, even when
        /// reading fails part-way.
        /// </summary>
        /// <param name="ms">Destination stream for the response body.</param>
        private void ReceiveResponse(MemoryStream ms)
        {
            this.httpResponse = (HttpWebResponse)this.httpRequest.GetResponse();
            try
            {
                if (!this.httpRequest.HaveResponse)
                {
                    this.httpResponse.Close();
                    this.httpRequest.Abort();
                    return;
                }

                this.ManualResetMember();

                if (null != this.OnGetResponseReadyHandler)
                {
                    try
                    {
                        this.OnGetResponseReadyHandler(this.httpRequest);
                    }
                    catch (System.Exception ex)
                    {
                        this.LastAccessError = true;
                        BaseDebug.DebugPrint(ex.ToString());
                    }
                }

                this.DoBetIsGotoRecv = true;

                using (Stream responseStream = this.httpResponse.GetResponseStream())
                {
                    if (null != responseStream && responseStream.CanRead)
                    {
                        byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
                        int read;
                        while ((read = responseStream.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            ms.Write(buffer, 0, read);
                        }
                    }
                }

                string setCookie = this.httpResponse.Headers["Set-Cookie"];
                if (null != setCookie)
                {
                    this.CurSetCookie = setCookie;
                }

                // Very important: rewind so callers read from the beginning.
                ms.Seek(0, SeekOrigin.Begin);
            }
            finally
            {
                this.httpResponse.Close();
            }
        }

        /// <summary>
        /// Common failure path: flags the error, logs it, releases the response attached to a
        /// <see cref="WebException"/> (if any) and aborts the request.
        /// </summary>
        private void HandleRequestFailure(string url, System.Exception ex)
        {
            this.LastAccessError = true;
            BaseDebug.DebugPrint("constant website address:" + url);
            BaseDebug.DebugPrint(ex.ToString());

            WebException webEx = ex as WebException;
            if (null != webEx && null != webEx.Response)
            {
                webEx.Response.Close();
            }

            if (null != httpRequest)
            {
                httpRequest.Abort();
            }
        }

        /// <summary>
        /// Sends a body-less request and returns the response body.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <param name="httpAccept">Accept header value.</param>
        /// <returns>
        /// A stream positioned at the start of the body on success. On failure the error is
        /// logged, <see cref="LastAccessError"/> is set and the stream holds whatever was read
        /// (possibly nothing). The caller owns the stream.
        /// </returns>
        public MemoryStream GetMemoryStream(string url, string method, CookieCollection cc, string referUrl, string httpAccept)
        {
            MemoryStream ms = new MemoryStream();
            try
            {
                this.SetHttpRequestOptions(url, method, cc, referUrl, httpAccept);
                this.ReceiveResponse(ms);
            }
            catch (System.Exception ex)
            {
                this.HandleRequestFailure(url, ex);
            }
            return ms;
        }

        /// <summary>Body-less request with Accept "text/html", no extra cookies and no referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method)
        {
            return this.GetMemoryStream(url, method, null, null, "text/html");
        }

        /// <summary>Body-less request with a custom Accept header, no extra cookies and no referer.</summary>
        public MemoryStream SimpleGetMemoryStream(string url, string method, string httpAccept)
        {
            return this.GetMemoryStream(url, method, null, null, httpAccept);
        }

        /// <summary>
        /// Sends a body-less request and returns the whole response as text.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding used to decode the response; null uses the StreamReader default.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string SimpleDoPostWrapper(string url, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            using (MemoryStream sm = this.GetMemoryStream(url, method, cc, referUrl, "text/html"))
            using (StreamReader sr = (null == coding) ? new StreamReader(sm) : new StreamReader(sm, coding))
            {
                return sr.ReadToEnd();
            }
        }

        public string SimpleDoPostWrapper(string url, string method)
        {
            // Call the full overload directly: passing a bare null as third argument is
            // ambiguous between the CookieCollection and string overloads.
            return this.SimpleDoPostWrapper(url, method, null, null, null);
        }

        public string SimpleDoPostWrapper(string url, string method, CookieCollection cc)
        {
            return this.SimpleDoPostWrapper(url, method, null, cc, null);
        }

        public string SimpleDoPostWrapper(string url, string method, string referUrl)
        {
            return this.SimpleDoPostWrapper(url, method, null, null, referUrl);
        }

        /// <summary>
        /// Sends a request with a body and returns the response body.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="data">Request body text.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding used to turn <paramref name="data"/> into bytes.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>
        /// A stream positioned at the start of the body on success; see the body-less overload
        /// for the failure contract. The caller owns the stream.
        /// </returns>
        public MemoryStream GetMemoryStream(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            MemoryStream ms = new MemoryStream();
            try
            {
                this.SetHttpRequestOptions(url, method, cc, referUrl, "text/html");

                byte[] bytesData = coding.GetBytes(data);
                using (Stream requestStream = httpRequest.GetRequestStream())
                {
                    requestStream.Write(bytesData, 0, bytesData.Length);
                    requestStream.Flush();
                }

                this.ReceiveResponse(ms);
            }
            catch (System.Exception ex)
            {
                this.HandleRequestFailure(url, ex);
            }
            return ms;
        }

        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding)
        {
            return this.GetMemoryStream(url, data, method, coding, null, null);
        }

        public MemoryStream SimpleGetMemoryStream(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.GetMemoryStream(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Sends a request with a body and returns the whole response as text.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="data">Request body text.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding used for the request body.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            // NOTE(review): the response is decoded with the StreamReader default encoding,
            // not with `coding` - unlike SimpleDoPostWrapper. Kept as published; confirm intent.
            using (MemoryStream sm = this.GetMemoryStream(url, data, method, coding, cc, referUrl))
            using (StreamReader sr = new StreamReader(sm))
            {
                return sr.ReadToEnd();
            }
        }

        public string DoPostWrapper(string url, string data, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, data, method, coding, null, null);
        }

        public string DoPostWrapper(string url, string data, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, data, method, coding, cc, null);
        }

        public string DoPostWrapper(string url, string data, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, data, method, coding, null, referUrl);
        }

        /// <summary>
        /// Sends a request whose body is built from a dictionary and returns the response text.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="dicArguments">Form fields; URL-encoded with the default encoding.</param>
        /// <param name="method">HTTP method.</param>
        /// <param name="coding">Encoding used to turn the built body into bytes.</param>
        /// <param name="cc">Extra cookies, or null.</param>
        /// <param name="referUrl">Referer, or null.</param>
        /// <returns>The response text (empty when the request failed).</returns>
        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc, string referUrl)
        {
            string data = this.BuildRequestArguments(dicArguments);
            return this.DoPostWrapper(url, data, method, coding, cc, referUrl);
        }

        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, null);
        }

        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, CookieCollection cc)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, cc, null);
        }

        public string DoPostWrapper(string url, Dictionary<string, string> dicArguments, string method, Encoding coding, string referUrl)
        {
            return this.DoPostWrapper(url, dicArguments, method, coding, null, referUrl);
        }

        /// <summary>
        /// Downloads binary content (for example a verification-code image) with Accept "*/*".
        /// The caller is responsible for closing the returned stream.
        /// </summary>
        /// <param name="url">Target URL.</param>
        /// <param name="method">HTTP method.</param>
        /// <returns>The response body as a memory stream.</returns>
        public MemoryStream DownloadStream(string url, string method)
        {
            return this.SimpleGetMemoryStream(url, method, "*/*");
        }

        /// <summary>
        /// Builds a URL-encoded "k1=v1&amp;k2=v2" string from a dictionary.
        /// </summary>
        /// <param name="dicArguments">Fields to encode; null or empty yields an empty string.</param>
        /// <param name="coding">Encoding for URL-encoding, or null for the HttpUtility default.</param>
        /// <returns>The encoded pairs joined with '&amp;', without a trailing separator.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments, Encoding coding)
        {
            if (null == dicArguments || 0 == dicArguments.Count)
            {
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder();
            foreach (KeyValuePair<string, string> kvp in dicArguments)
            {
                // Separator goes before every pair except the first, so nothing needs trimming.
                if (sb.Length > 0)
                {
                    sb.Append("&");
                }

                if (null != coding)
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key, coding) + "=" + HttpUtility.UrlEncode(kvp.Value, coding));
                }
                else
                {
                    sb.Append(HttpUtility.UrlEncode(kvp.Key) + "=" + HttpUtility.UrlEncode(kvp.Value));
                }
            }

            return sb.ToString();
        }

        /// <summary>
        /// Builds a URL-encoded argument string from a dictionary using the default encoding.
        /// </summary>
        /// <param name="dicArguments">Fields to encode.</param>
        /// <returns>The encoded argument string.</returns>
        public string BuildRequestArguments(Dictionary<string, string> dicArguments)
        {
            return this.BuildRequestArguments(dicArguments, null);
        }

        /// <summary>
        /// Looks up the value of one cookie.
        /// </summary>
        /// <param name="key">Cookie name.</param>
        /// <param name="domain">Absolute URI string used to select the cookies (passed to <c>new Uri</c>).</param>
        /// <returns>The cookie value, or an empty string when the container is empty or the cookie is absent.</returns>
        public string GetCookieValue(string key, string domain)
        {
            if (null == this.cookieContainer || 0 == this.cookieContainer.Count)
            {
                return string.Empty;
            }

            CookieCollection cc = this.cookieContainer.GetCookies(new Uri(domain));

            // The indexer yields null for an unknown name; do not dereference it blindly.
            Cookie cookie = cc[key];
            return (null == cookie) ? string.Empty : cookie.Value;
        }

        /// <summary>
        /// Replaces the cookie container used for subsequent requests.
        /// </summary>
        /// <param name="cc">The new container.</param>
        public void SetCookieContainer(CookieContainer cc)
        {
            this.cookieContainer = cc;
        }

        /// <summary>
        /// Aborts the current request, if any.
        /// </summary>
        /// <returns>True when both <see cref="CheckGotoRecv"/> and <see cref="DoBetIsGotoRecv"/> are set.</returns>
        public bool AbortHttpRequest()
        {
            if (null != this.httpRequest)
            {
                this.httpRequest.Abort();
            }
            return this.CheckGotoRecv && this.DoBetIsGotoRecv;
        }
    }
}
Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page do not have any relationship with Alibaba Cloud. If you find the content of the page confusing, please send us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.