using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads one web page and exposes its title, raw HTML, plain text and links.
/// A page that could not be fetched (bad URL, network error, non-text content,
/// oversized body) is represented by an instance whose <see cref="IsGood"/> is
/// false and whose text, HTML and link members are empty.
/// </summary>
public class WebPage
{
    #region Private members

    /// <summary>Largest response body, in bytes, that will be downloaded.</summary>
    private const int MaxPageBytes = 1 << 22;

    /// <summary>Decoded texts with more replacement characters than this are re-decoded with the default encoding.</summary>
    private const int MaxReplacementChars = 100;

    /// <summary>URL suffixes that are never downloaded.</summary>
    private static readonly string[] SkippedExtensions = new string[] { ".rar", ".dat", ".msi" };

    private static readonly Regex TitleRegex = new Regex(
        @"<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", RegexOptions.IgnoreCase);

    // Group names are case-sensitive in .NET; both link patterns use "url" and the
    // anchor pattern uses "text", matching what AddLinks reads.
    private static readonly Regex AnchorRegex = new Regex(
        @"<a\shref\s*=""(?<url>[^""]*).*?>(?<text>[^<]*)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex FrameRegex = new Regex(
        "<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>",
        RegexOptions.IgnoreCase);

    private static readonly Regex LinkTextNoiseRegex = new Regex(
        "(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.IgnoreCase);

    private static readonly Regex ScriptBlockRegex = new Regex(
        @"<script[^>]*>[\s\S]*?</script[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex StyleBlockRegex = new Regex(
        @"<style[^>]*>[\s\S]*?</style[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex SelectBlockRegex = new Regex(
        @"<select[^>]*>[\s\S]*?</select[^>]*>", RegexOptions.IgnoreCase);

    // \b keeps this from matching other tags that merely start with "a" (e.g. <abbr>).
    private static readonly Regex AnchorBlockRegex = new Regex(
        @"<a\b[^>]*>[\s\S]*?</a\s*>", RegexOptions.IgnoreCase);

    private static readonly Regex TagRegex = new Regex(
        "(<[^>]+?>)|&nbsp;", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

    private static readonly Regex CharsetRegex = new Regex(
        @"charset\s*=\s*[""']?(?<cding>[\w\-.:+]+)", RegexOptions.IgnoreCase);

    private Uri m_uri;                                  // address of the page (final address after redirects)
    private List<Link> m_links = new List<Link>();      // links found on the page
    private bool m_linksParsed;                         // true once m_html has been scanned for links
    private string m_title = "";                        // page title
    private string m_html = "";                         // HTML source
    private string m_outstr = "";                       // plain text extracted from the HTML
    private bool m_good;                                // whether the page was downloaded successfully
    private int m_pagesize;                             // length of the HTML source, in characters

    // One cookie container per host, shared by every WebPage instance.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    #endregion

    #region Properties

    /// <summary>
    /// Absolute address of the page; empty when the address could not be parsed. Read-only.
    /// </summary>
    public string URL
    {
        get { return m_uri == null ? "" : m_uri.AbsoluteUri; }
    }

    /// <summary>
    /// Content of the page's title element, trimmed; empty when there is none. Read-only.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                    m_title = mc.Groups["title"].Value.Trim();
            }
            return m_title;
        }
    }

    /// <summary>
    /// HTML source of the page; never null. Read-only.
    /// </summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>
    /// All links found on the page (anchors first, then frames). Read-only.
    /// </summary>
    public List<Link> Links
    {
        get { return getLinks(); }
    }

    /// <summary>
    /// Complete plain text of the page with markup removed. Read-only.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "") getContext(Int16.MaxValue);
            return m_outstr;
        }
    }

    /// <summary>
    /// Length of the HTML source in characters.
    /// </summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// Links whose address starts with http:// or https:// followed by this page's host.
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null) return new List<Link>();
            // The host is escaped so that its dots are matched literally.
            return getSpecialLinksByUrl("^https?://" + Regex.Escape(m_uri.Host), Int16.MaxValue);
        }
    }

    /// <summary>
    /// True when the page was downloaded and decoded successfully.
    /// </summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>
    /// Host name of the page's address; empty when the address could not be parsed.
    /// </summary>
    public string Host
    {
        get { return m_uri == null ? "" : m_uri.Host; }
    }

    #endregion

    /// <summary>
    /// Extracts link information from the HTML source. The source is scanned at most once.
    /// </summary>
    /// <returns>The list of links found on the page.</returns>
    private List<Link> getLinks()
    {
        if (m_linksParsed) return m_links;
        m_linksParsed = true;

        // Relative addresses are resolved against m_uri, so nothing can be done without it.
        if (m_uri == null || m_html.Length == 0) return m_links;

        AddLinks(AnchorRegex, true);
        AddLinks(FrameRegex, false);
        return m_links;
    }

    /// <summary>
    /// Appends one link to m_links for every match of <paramref name="pattern"/> in the HTML source.
    /// </summary>
    /// <param name="pattern">Pattern with a "url" group and, when captureText is true, a "text" group.</param>
    /// <param name="captureText">Whether the link text is taken from the "text" group; otherwise it is empty.</param>
    private void AddLinks(Regex pattern, bool captureText)
    {
        for (Match match = pattern.Match(m_html); match.Success; match = match.NextMatch())
        {
            try
            {
                string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri);
                string text = captureText
                    ? LinkTextNoiseRegex.Replace(match.Groups["text"].Value, "")
                    : "";

                Link link = new Link();
                link.Text = text;
                link.NavigateUrl = url;
                m_links.Add(link);
            }
            catch (Exception ex)
            {
                // A malformed address only costs that one link; keep scanning.
                Console.WriteLine(ex.Message);
            }
        }
    }

    /// <summary>
    /// Extracts plain text from HTML and returns its first characters.
    /// The extracted text is cached in m_outstr, so the markup is stripped only on the first call.
    /// </summary>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstN">Maximum number of characters to return; values below zero return an empty string.</param>
    /// <param name="withLink">Whether the text inside anchor elements is kept.</param>
    /// <returns>Plain text, at most firstN characters long.</returns>
    private string getFirstNChar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr ?? "";
            text = ScriptBlockRegex.Replace(text, " ");
            text = StyleBlockRegex.Replace(text, " ");
            text = SelectBlockRegex.Replace(text, " ");
            if (!withLink) text = AnchorBlockRegex.Replace(text, "");
            text = TagRegex.Replace(text, "");
            // Collapse whitespace runs to a single space so adjacent words stay separated.
            text = WhitespaceRegex.Replace(text, " ");
            m_outstr = text;
        }

        if (firstN < 0) return "";
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose address or text matches <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular expression, matched case-insensitively.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="byUrl">True to test NavigateUrl, false to test Text.</param>
    /// <returns>The matching links, in page order.</returns>
    private List<Link> FilterLinks(string pattern, int count, bool byUrl)
    {
        getLinks();

        List<Link> matches = new List<Link>();
        // Built once, not once per link.
        Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        foreach (Link link in m_links)
        {
            if (matches.Count >= count) break;
            string candidate = byUrl ? link.NavigateUrl : link.Text;
            if (filter.Match(candidate ?? "").Success)
                matches.Add(link);
        }
        return matches;
    }

    #region Public methods

    /// <summary>
    /// Returns the first characters of the page's plain text, link text included.
    /// </summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <returns>Plain text, at most firstN characters long.</returns>
    public string getContext(int firstN)
    {
        return getFirstNChar(m_html, firstN, true);
    }

    /// <summary>
    /// Returns links from this page whose address matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression, matched case-insensitively.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return FilterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links from this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression, matched case-insensitively.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return FilterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text with a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression, matched case-insensitively; its first capture group is returned.</param>
    /// <returns>Value of capture group 1 of the first match, or an empty string when nothing matches.</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "") getContext(Int16.MaxValue);
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        if (mc.Success)
            return mc.Groups[1].Value;
        return string.Empty;
    }

    #endregion

    #region Construction

    /// <summary>
    /// Downloads the page at <paramref name="_url"/> and fills in the instance state.
    /// Any failure leaves the text members empty and m_good false; no exception escapes.
    /// </summary>
    /// <param name="_url">Absolute address of the page.</param>
    private void Init(string _url)
    {
        m_links = new List<Link>();
        m_linksParsed = false;
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;     // only set to true once the body has been read and decoded

        try
        {
            m_uri = new Uri(_url);
            if (HasSkippedExtension(_url)) return;

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            lock (WebPage.webcookies)
            {
                CookieContainer cc;
                if (!WebPage.webcookies.TryGetValue(m_uri.Host, out cc))
                {
                    cc = new CookieContainer();
                    WebPage.webcookies[m_uri.Host] = cc;
                }
                rqst.CookieContainer = cc;
            }

            HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
            try
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxPageBytes)
                {
                    return;
                }

                byte[] body;
                using (Stream sm = rsps.GetResponseStream())
                {
                    body = ReadBody(sm, MaxPageBytes);
                }
                if (body == null) return;   // body larger than MaxPageBytes

                m_html = Decode(body, contentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
            finally
            {
                rsps.Close();
            }
        }
        catch (Exception)
        {
            // Download failures are reported through IsGood rather than thrown.
            m_good = false;
        }
    }

    /// <summary>
    /// Tells whether the address ends with one of the extensions that are never downloaded.
    /// </summary>
    private static bool HasSkippedExtension(string url)
    {
        foreach (string extension in SkippedExtensions)
        {
            if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase)) return true;
        }
        return false;
    }

    /// <summary>
    /// Reads a stream to its end, giving up when it holds more than <paramref name="limit"/> bytes.
    /// </summary>
    /// <returns>The bytes read, or null when the limit was exceeded.</returns>
    private static byte[] ReadBody(Stream source, int limit)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
            {
                if (buffer.Length + read > limit) return null;
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes a response body. The charset named in the Content-Type header wins; otherwise the
    /// charset declared in the markup is used; otherwise the system default encoding.
    /// </summary>
    private static string Decode(byte[] body, string contentType)
    {
        Encoding fromHeader = FindEncoding(contentType);
        if (fromHeader != null) return fromHeader.GetString(body);

        // No charset in the header: decode once with the default encoding just to read the
        // markup's own charset declaration, then decode the raw bytes with that charset.
        string provisional = Encoding.Default.GetString(body);
        Encoding fromMarkup = FindEncoding(provisional);
        if (fromMarkup == null) return provisional;

        string decoded = fromMarkup.GetString(body);
        // Many replacement characters mean the declared charset was wrong.
        return CountChar(decoded, '\uFFFD') > MaxReplacementChars ? provisional : decoded;
    }

    /// <summary>
    /// Looks for a "charset=name" declaration in <paramref name="text"/>.
    /// </summary>
    /// <returns>The named encoding, or null when none is declared or the name is unknown.</returns>
    private static Encoding FindEncoding(string text)
    {
        Match declaration = CharsetRegex.Match(text);
        if (!declaration.Success) return null;
        try
        {
            return Encoding.GetEncoding(declaration.Groups["cding"].Value);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Counts the occurrences of a character in a string.
    /// </summary>
    private static int CountChar(string text, char wanted)
    {
        int total = 0;
        foreach (char c in text)
        {
            if (c == wanted) total++;
        }
        return total;
    }

    /// <summary>
    /// Downloads the page at the given address. Check <see cref="IsGood"/> afterwards;
    /// the constructor does not throw when the download fails.
    /// </summary>
    /// <param name="_url">Address of the page; percent-escapes are unescaped before use.</param>
    public WebPage(string _url)
    {
        string target = _url;
        try
        {
            target = Uri.UnescapeDataString(_url);
        }
        catch (Exception)
        {
            // Fall back to the address exactly as given.
        }
        Init(target);
    }

    #endregion
}
// Usage:
//   WebPage webInfo = new WebPage("url");
//   string text = webInfo.Context;   // all content with the HTML tags removed
//   string html = webInfo.M_html;    // content including the HTML tags
//   ... the remaining properties are read the same way