C# HttpWebRequest Tricks: Getting Web Page Information from a URL

Source: Internet
Author: User

Tags:

If you want to use the middle method, you can access my help class completely free open Source: C # Httphelper, help classes, real HttpRequest requests ignore the code, ignore the certificate, ignore the cookie, Web page crawl 1. First move, get web page information according to URL address                First look at the code get method copy code publicstaticstring geturltohtml (String url,string type) {try {                System.Net.WebRequest WReq = System.Net.WebRequest.Create (URL);                Get the response instance.                System.Net.WebResponse Wresp = Wreq.getresponse ();                System.IO.Stream Respstream = Wresp.getresponsestream (); Dim reader as StreamReader = new StreamReader (respstream) using (System.IO.StreamReader reader = new Sys Tem. Io. StreamReader (Respstream, encoding.getencoding (type))) {return reader.                ReadToEnd (); }} catch (System.Exception ex) {//errormsg = ex.            Message;        } return ""; Copy code post method copy code///<summary>///Use HTTPS protocol to access network///</summary>///<param name= "URL" >url address </param>///<param name= "Strpostdata" > Sent data </param>///<r        Eturns></returns> publicstring Openreadwithhttps (string url,string strpostdata,string strEncoding)            {Encoding Encoding = Encoding.default;            HttpWebRequest request = (HttpWebRequest) webrequest.create (URL); Request.            Method = "POST"; Request.            Accept = "text/html, Application/xhtml+xml, */*"; Request.            ContentType = "application/x-www-form-urlencoded"; Byte[] buffer= encoding.            GetBytes (Strpostdata); Request. contentlength = buffer.            Length; Request. GetRequestStream (). Write (buffer, 0, buffer.            Length); HttpWebResponse response = (HttpWebResponse) request.            GetResponse (); using (StreamReader Reader =new StreamReader (response. GetResponseStream (), System.Text.Encoding.GetEncoding (strencoding)) {return reader. 
      ReadToEnd ();       }} Copy code this is the first form of entry, features: 1. The simplest and most intuitive one, introductory course.   2. Adapt to clear text, no need to log in, no Authentication required to enter the page.   3. The data type obtained is an HTML document. 4. The request method is get/post2. The second trick is to get the Web page information that needs to be verified by the URL address first, take a look at the code get method copy code//callback Validation certificate problem Publicbool CheckValidationResult (objec T sender, X509Certificate certificate, X509chain chain, sslpolicyerrors errors) {//Always accept Returntru        E }///<summary>///incoming URL returns HTML code for Web page///</summary>//<param name= "URL" >ur            l</param>//<returns></returns> publicstring geturltohtml (string Url) {            StringBuilder content =new StringBuilder (); try {//This sentence must be written before creating the connection.                Use the callback method for certificate validation. Servicepointmanager.servercertificatevalidationcallback=new                System.Net.Security.RemoteCertificateValidationCallback (CheckValidationResult);     Creates an HTTP request with the specified URL HttpWebRequest request = (HttpWebRequest) webrequest.create (URL);           Create a certificate file X509Certificate objx509 = new X509Certificate (application.startuppath+ "\\123.cer"); Add to request.                Clientcertificates.add (objx509); Gets the response of the corresponding HTTP request httpwebresponse response = (HttpWebResponse) request.                GetResponse (); Gets the response stream stream Responsestream = response.                
GetResponseStream ();                Docking response flow (in "GBK" character set) StreamReader Sreader =new StreamReader (Responsestream, encoding.getencoding ("Utf-8"));                Start reading data char[] Sreaderbuffer =new char[256];                int count= sreader.read (sreaderbuffer,0,256);                    while (count>0) {string TempStr =new string (sreaderbuffer,0, count); Content.                    Append (TEMPSTR);                Count = Sreader.read (sreaderbuffer,0,256);            }//Read end sreader.close ();       }     catch (Exception) {content =new StringBuilder ("Runtime Error"); } return content.        ToString ();  Copy code post method copy code//callback Validation certificate problem publicbool CheckValidationResult (object sender, X509Certificate certificate, X509chain        Chain, sslpolicyerrors errors) {//Always accept returntrue; }///<summary>///Use HTTPS protocol to access network///</summary>///<param name= "URL" >url address <        /param>///<param name= "Strpostdata" > Data sent </param>///<returns></returns> Publicstring Openreadwithhttps (String url,string strpostdata,string strencoding) {//This sentence must be written in front of the create connection.            Use the callback method for certificate validation. Servicepointmanager.servercertificatevalidationcallback=new            System.Net.Security.RemoteCertificateValidationCallback (CheckValidationResult);            Encoding Encoding = Encoding.default; HttpWebRequest request = (HttpWebRequest)WebRequest.Create (URL);            Create a certificate file X509Certificate objx509 =new x509certificate (application.startuppath+ "\\123.cer"); Load the cookie request.            Cookiecontainer =new Cookiecontainer (); Add to request.            Clientcertificates.add (objx509); Request.            Method = "POST"; Request.            Accept = "text/html, Application/xhtml+xml, */*"; Request.            
ContentType = "application/x-www-form-urlencoded"; Byte[] buffer= encoding.            GetBytes (Strpostdata); Request. contentlength = buffer.            Length; Request. GetRequestStream (). Write (buffer, 0, buffer.            Length); HttpWebResponse response = (HttpWebResponse) request.            GetResponse (); using (StreamReader Reader =new StreamReader (response. GetResponseStream (), System.Text.Encoding.GetEncoding (strencoding)) {return reader.               ReadToEnd (); }} Copy code this is learned to be admitted into the door, all need to verify the certificate to enter the page can use this method to enter, I use the certificate callback authentication method, the certificateVerify whether through the client authentication, so that we can use their own definition of a method to verify, some people will say that it is not clear how to verify Ah, the other is very simple, the code is to write their own why should be so difficult for their own, directly return a true is not finished, will always be verified through,   This can ignore the existence of the certificate, features: 1. Into the front of the small problem, the beginner course.   2. Suitable for pages that require no login, clear text but need to verify the certificate to access.   3. The data type obtained is an HTML document. 4. The request method is get/post3. 
Third trick: getting information from a web page that requires login, by URL. Let's first analyze this kind of page. A page that requires login is really just another form of authentication: it verifies whether the client has logged in and holds the corresponding credentials. Every page that requires login checks the SessionID, so what do we do? Our first step is to obtain the cookie data, which includes the SessionID. There are many ways to get it; it is easy with IE9 or Firefox. You can refer to my article that gives an example of scraping the hao123 mobile phone number attribution page, which has detailed instructions for IE9.   Once we have the cookie information of the logged-in session, accessing the corresponding page is easy: we simply send the local cookie information along with the request. See the code. GET method: View Code///<summary>///incoming URL returns HTML code for Web page with certificate method//</summary>//<param NA        Me= "url" >URL</param>///<returns></returns> publicstring geturltohtml (string url)            {StringBuilder content =new StringBuilder (); try {//create HTTP request with specified URL HttpWebRequest request = (HttpWebRequest) webrequest.creat       E (URL);         Request. useragent = "mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; trident/5.0; BOIE9;                ZHCN) "; Request.                Method = "GET"; Request.                Accept = "*/*"; If the method validates the source of the page, add this sentence if you do not verify it, you can not write the request.                Referer = "http://sufei.cnblogs.com";                Cookiecontainer objcok =new Cookiecontainer (); Objcok.                ADD (New Uri ("http://sufei.cnblogs.com"), New Cookie ("Key", "value")); Objcok.                ADD (New Uri ("http://sufei.cnblogs.com"), New Cookie ("Key", "value")); Objcok.                
ADD (New Uri ("http://sufei.cnblogs.com"), New Cookie ("Sidi_sessionid", "360a748941d055bee8c960168c3d4233")); Request.                Cookiecontainer = Objcok; Do not keep the connection request.                KeepAlive =true; Gets the response of the corresponding HTTP request httpwebresponse response = (HttpWebResponse) request.                GetResponse (); Gets the response stream stream Responsestream = response.                GetResponseStream (); Docking response stream (in the "GBK" character set)                StreamReader sreader =new StreamReader (Responsestream, encoding.getencoding ("gb2312"));                Start reading data char[] Sreaderbuffer =new char[256];                int count= sreader.read (sreaderbuffer,0,256);                    while (count>0) {string TempStr =new string (sreaderbuffer,0, count); Content.                    Append (TEMPSTR);                Count = Sreader.read (sreaderbuffer,0,256);            }//Read end sreader.close ();            } catch (Exception) {content =new StringBuilder ("Runtime Error"); } return content.        ToString (); }post method. View code///<summary>///Use HTTPS protocol to access network///</summary>///<param name= "URL" >url address <        /param>///<param name= "Strpostdata" > Data sent </param>///<returns></returns> Publicstring Openreadwithhttps (StRing url,string strpostdata) {Encoding Encoding = Encoding.default;            HttpWebRequest request = (HttpWebRequest) webrequest.create (URL); Request.            Method = "POST"; Request.            Accept = "text/html, Application/xhtml+xml, */*"; Request.            ContentType = "application/x-www-form-urlencoded";            Cookiecontainer objcok =new Cookiecontainer (); Objcok.            ADD (New Uri ("http://sufei.cnblogs.com"), New Cookie ("Key", "value")); Objcok.            ADD (New Uri ("http://sufei.cnblogs.com"), New Cookie ("Key", "value")); Objcok.            
ADD (New Uri ("http://sufei.cnblogs.com"), New Cookie ("Sidi_sessionid", "360a748941d055bee8c960168c3d4233")); Request.            Cookiecontainer = Objcok; Byte[] buffer= encoding.            GetBytes (Strpostdata); Request. contentlength = buffer.            Length; Request. GetRequestStream (). Write (buffer, 0, buffer.            Length); HttpWebResponse response = (HttpWebResponse) request.            GetResponse (); StreamreadER reader =new StreamReader (response.            GetResponseStream (), System.Text.Encoding.GetEncoding ("Utf-8")); Return reader.        ReadToEnd ();   } Features: 1. Still a little water type, after the success of the practice can be a calf.   2. Adapt to pages that need to be logged in to access.   3. The data type obtained is an HTML document. 4. Request method for Get/post summary, other basic skills in these parts, if you go deeper, that is the combination of basic skills such as, 1. Use the Get or post method to log in and then get a cookie to access the page to get information, this other is the combination of the above skills, here need to do such a step after the request response. Cookies This is the way that you can get a cookie when you request it, just get back to the previous method and use it, and we're all built on it, so we can use this cookie directly here. 2. If we come across a webpage that needs to be logged in and have to verify the certificate, the other one is simple. Here's the code here. Here's the same approach to the post example of Get for example: View Code///<summary>///Incoming U RL returns HTML code for Web page///</summary>//<param name= "URL" >URL</param>//<returns>& Lt;/returns> publicstring geturltohtml (string Url) {StringBuilder content =new StringBuilder (            ); try {//This sentence must be written before creating the connection.                Use the callback method for certificate validation. 
Servicepointmanager.servercertificatevalidationcallback=new System.Net.Security.RemoteCertificateValidationCaLlback (CheckValidationResult);                Creates an HTTP request with the specified URL HttpWebRequest request = (HttpWebRequest) webrequest.create (URL);                Create a certificate file X509Certificate objx509 = new X509Certificate (application.startuppath+ "\\123.cer"); Add to request.                Clientcertificates.add (objx509);                Cookiecontainer objcok =new Cookiecontainer (); Objcok.                ADD (New Uri ("http://www.cnblogs.com"), New Cookie ("Key", "value")); Objcok.                ADD (New Uri ("http://www.cnblogs.com"), New Cookie ("Key", "value")); Objcok.                ADD (New Uri ("http://www.cnblogs.com"), New Cookie ("Sidi_sessionid", "360a748941d055bee8c960168c3d4233")); Request.                Cookiecontainer = Objcok; Gets the response of the corresponding HTTP request httpwebresponse response = (HttpWebResponse) request.                GetResponse (); Gets the response stream stream Responsestream = response.                GetResponseStream ();           Docking response stream (in the "GBK" character set)     StreamReader sreader =new StreamReader (Responsestream, encoding.getencoding ("Utf-8"));                Start reading data char[] Sreaderbuffer =new char[256];                int count= sreader.read (sreaderbuffer,0,256);                    while (count>0) {string TempStr =new string (sreaderbuffer,0, count); Content.                    Append (TEMPSTR);                Count = Sreader.read (sreaderbuffer,0,256);            }//Read end sreader.close ();            } catch (Exception) {content =new StringBuilder ("Runtime Error"); } return content.        ToString (); }3. 
What if we come across a method that needs to verify the source of the Web page, and the other is that some programmers will think that you might use the program to automatically get the page information, in order to prevent the use of page sources to verify, that is, as long as not from their page or the domain name of the request is not accepted, Some are direct authentication source IP, these can use the following sentence to enter, this is mainly the address can be directly forged request. Referer = "http://sufei.cnblogs.com"; hehe other very simple because this address can be directly modified. But if the server is verifying the URL of the source then it is over, we have to modify the packet, this is a bit difficult to discuss for the time being. 4. Provide some of the methods configured with this example to filter the HTML Label Method View Code///<summary>///filter HTML tags///</summary>//<param name= "strHTML" ; HTML content </param>///<returns></returns> publicstaticstring striphtml (String stringToStrip ) {//paring using RegEx//stringToStrip = Regex.Replace (stringToStrip, "</p (?: \ \s*) > (?: \ \s*) <p (?: \ \s*) > "," \ n \ regexoptions.ignorecase| ",            regexoptions.compiled); stringToStrip = Regex.Replace (stringToStrip, "<br" (?: \ \s*)/> "," \ n ", regexoptions.ignorecase|            regexoptions.compiled); stringToStrip = Regex.Replace (stringToStrip, "\" "," "", regexoptions.ignorecase|            regexoptions.compiled);            stringToStrip = Striphtmlxmltags (stringToStrip);        return stringToStrip; } privatestaticstring Striphtmlxmltags (string content) {return Regex.Replace (content, "<[^> ]+> "," ", regexoptions.ignorecase|        regexoptions.compiled);    } Method of URL conversion copy Code #region conversion URL publicstaticstring UrlDecode (string text) {return Httputility.urld        Ecode (text, encoding.default); } publicstaticstring UrlEncode (string text) {return Httputility.urlencode (text, Encoding.default        
); } #endregion复制代码提供一个实际例子, this is the use of IP138 to query the location of the mobile phone number of the method, the other in my last article has, here I put up is convenient for everyone to read, this aspect of the technical other research is very interesting, I hope that we have more suggestions, I believe there should be more better, more perfect methods, here to provide you with a reference bar. Thank you for supporting the example copy code///<summary>///input mobile number to get attribution information///</summary>//<param name= "numb ER "> Phone number </param>//<returns> array type 0 for attribution, 1 card type, 2 area code, 3 postcode </returns> publicstaticstring[] G Ettelldate (string number) {try {string strsource= geturltohtml ("http://www.i P138.com:8080/search.asp?action=mobile&mobile= "+ number.                Trim ());                Place of Attribution strsource = Strsource.substring (strsource.indexof (number)); Strsource = StripHTML (strsource);                strsource = Strsource.replace ("\ R", "");                strsource = Strsource.replace ("\ n", "");                strsource = Strsource.replace ("\ T", "");                strsource = Strsource.replace (" ", "");                strsource = Strsource.replace ("--", "" "); String[] strnumber= strsource.split (newstring[] {"Place of Attribution", "card type", "Zip Code", "Area code", "more detailed", "card number"},                Stringsplitoptions.removeemptyentries);                String[] Strnumber1=null; if (Strnumber. length>4) {Strnumber1 =newstring[] {strnumber[1]. Trim (), strnumber[2]. Trim (), strnumber[3]. Trim (), strnumber[4].                
Trim ()};            } return strnumber1;            } catch (Exception) {returnnull; }} Copy Code This example writes is not good, some places can be simplified, this interface and can be directly using XML to get, but my focus here is to let some novice look at the methods and ideas cool Ah, oh, the four strokes, Access----------------------------------------------------------------------------------------through the socket--------------------Copy Code///<summary>///Request public class used to send request to server///</summary>///<param Nam E= "Strsmsrequest" > Send request String </param>///<returns> returns the requested information </returns> private static Strin            G Smsrequest (String strsmsrequest) {byte[] data = new byte[1024];            string stringdata = null;            Iphostentry gist = Dns.gethostbyname ("www.110.cn"); IPAddress IP = gist.            ADDRESSLIST[0];            Get IP IPEndPoint ipend = new IPEndPoint (IP, 3121);            Default 80 port number Socket SOCKET = new socket (addressfamily.internetwork, SocketType.Stream, protocoltype.tcp); Use the TCP protocol stream type try {socket.            Connect (Ipend); } catch (SocketException ex) {return "Fail to connect server\r\n" + ex.            ToString (); } String path = Strsmsrequest.tostring ().            Trim (); StringbuildeR buf = new StringBuilder (); Buf. Append ("GET"). Append (Path).            Append ("http/1.0\r\n"); Buf.            Append ("content-type:application/x-www-form-urlencoded\r\n"); Buf.            Append ("\ r \ n"); Byte[] ms = System.Text.UTF8Encoding.UTF8.GetBytes (buf.            ToString ()); Submits the requested information to the socket.            Send (MS);            Receive returns String strsms = "";            int recv = 0; do {recv = socket.                
Receive (data);                StringData = Encoding.ASCII.GetString (data, 0, recv);                If the page encoding specified in the requested page meta is gb2312, the corresponding encoding is required to convert the bytes () strsms = strsms + stringdata; Strsms + = recv.            ToString ();            } while (recv! = 0); Socket.            Shutdown (Socketshutdown.both); Socket.            Close ();        return strsms; }

C# HttpWebRequest Tricks: Getting Web Page Information from a URL

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

Tags Index: