Related to Regular Expressions: Crawling Webpages in C# (Retrieving All Information on a Webpage)

Last Update:2018-12-03 Source: Internet

Author: User

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads a single web page and exposes its title, raw HTML, plain text
/// and hyperlinks, all extracted with regular expressions.
/// </summary>
/// <remarks>
/// The constructor never throws: any failure while downloading or decoding
/// is reported through <see cref="IsGood"/> being <c>false</c>.
/// </remarks>
public class Webpage
{
    #region Private members

    /// <summary>Largest response body that will be downloaded (1 &lt;&lt; 22 bytes = 4 MiB).</summary>
    private const int MaxPageBytes = 1 << 22;

    /// <summary>Options shared by most patterns in this class.</summary>
    private const RegexOptions DefaultOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    private Uri m_uri;                                  // address of the page (final address after redirects)
    private List<Link> m_links = new List<Link>();      // links found on the page (filled lazily)
    private string m_title = "";                        // page title (filled lazily)
    private string m_source = "";                       // HTML source; exposed through the m_html property
    private string m_outstr = "";                       // plain text of the page (filled lazily)
    private bool m_good;                                // whether the page was downloaded successfully
    private int m_pagesize;                             // length of the HTML source, in characters

    // One cookie container per host, shared by every Webpage instance.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    #endregion

    #region Properties

    /// <summary>Absolute address of this page (read-only); empty if the address could not be parsed.</summary>
    public string URL
    {
        get { return m_uri == null ? string.Empty : m_uri.AbsoluteUri; }
    }

    /// <summary>Title of this page (read-only), taken from the first &lt;title&gt; element.</summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", DefaultOptions);
                Match mc = reg.Match(m_html);
                if (mc.Success)
                    m_title = mc.Groups["title"].Value.Trim();
            }
            return m_title;
        }
    }

    /// <summary>
    /// HTML source of this page (read-only); never null.
    /// The name is kept as published in the usage sample; the backing field is m_source.
    /// </summary>
    public string m_html
    {
        get
        {
            if (m_source == null)
            {
                m_source = "";
            }
            return m_source;
        }
    }

    /// <summary>All links found on this page (read-only).</summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0) getLinks();
            return m_links;
        }
    }

    /// <summary>All plain text of this page, with markup removed (read-only).</summary>
    public string Context
    {
        get
        {
            if (m_outstr == "") getContext(Int16.MaxValue);
            return m_outstr;
        }
    }

    /// <summary>Size of this page: the number of characters in the decoded HTML.</summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// Links on this page whose address starts with this page's own host
    /// (http or https). The host is regex-escaped so that its dots are
    /// matched literally.
    /// </summary>
    public List<Link> InSiteLinks
    {
        get
        {
            if (m_uri == null) return new List<Link>();
            return getSpecialLinksByUrl("^https?://" + Regex.Escape(m_uri.Host), Int16.MaxValue);
        }
    }

    /// <summary>Whether this page was downloaded and decoded successfully.</summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>Host name of the site this page belongs to; empty if the address could not be parsed.</summary>
    public string Host
    {
        get { return m_uri == null ? string.Empty : m_uri.Host; }
    }

    #endregion

    #region Private helpers

    /// <summary>
    /// Extracts link information from the HTML source: anchors
    /// (address + text) and frame/iframe sources (address only).
    /// </summary>
    /// <returns>The list of links, also cached in m_links.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            Regex[] regex = new Regex[2];
            // [0] anchors; both named groups are read below, so the names must match exactly.
            regex[0] = new Regex(@"<a\shref\s*=""(?<url>[^""]*).*?>(?<text>[^<]*)</a>",
                                 RegexOptions.IgnoreCase | RegexOptions.Singleline);
            // [1] <frame> / <iframe> elements; only the src address is captured.
            regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>",
                                 RegexOptions.IgnoreCase);
            // Strips tags, whitespace, &nbsp;, ampersands and quotes from anchor text.
            Regex textCleaner = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", DefaultOptions);

            for (int i = 0; i < 2; i++)
            {
                Match match = regex[i].Match(m_html);
                while (match.Success)
                {
                    try
                    {
                        // Resolve relative addresses against the page address.
                        string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri);
                        string text = "";
                        if (i == 0)
                            text = textCleaner.Replace(match.Groups["text"].Value, "");

                        Link link = new Link();
                        link.Text = text;
                        link.NavigateUrl = url;
                        m_links.Add(link);
                    }
                    catch (Exception ex)
                    {
                        // A malformed address only costs us that one link.
                        Console.WriteLine(ex.Message);
                    }
                    match = match.NextMatch();
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Extracts up to a given number of characters of plain text from HTML.
    /// The stripped text is cached in m_outstr; once the cache is non-empty
    /// it is reused regardless of the arguments passed.
    /// </summary>
    /// <param name="instr">HTML code</param>
    /// <param name="firstN">Number of characters to return from the start</param>
    /// <param name="withLink">Whether the text inside anchors is kept</param>
    /// <returns>Plain text, at most <paramref name="firstN"/> characters long</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            m_outstr = instr ?? "";
            // Drop elements whose content is not readable text.
            m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", DefaultOptions).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", DefaultOptions).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", DefaultOptions).Replace(m_outstr, "");
            if (!withLink)
                m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", DefaultOptions).Replace(m_outstr, "");

            // Remove every remaining tag and non-breaking-space entity.
            Regex objReg = new Regex("(<[^>]+?>)|&nbsp;", DefaultOptions);
            m_outstr = objReg.Replace(m_outstr, "");

            // Collapse each run of whitespace into a single space.
            // NOTE(review): the published listing is whitespace-mangled, so the exact
            // replacement string (" " vs "") could not be confirmed from it.
            Regex objReg2 = new Regex("(\\s)+", DefaultOptions);
            m_outstr = objReg2.Replace(m_outstr, " ");
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose address or text
    /// matches <paramref name="pattern"/>. The pattern is compiled once,
    /// not once per link.
    /// </summary>
    private List<Link> FilterLinks(string pattern, int count, bool byUrl)
    {
        if (m_links.Count == 0) getLinks();

        Regex matcher = new Regex(pattern, DefaultOptions);
        List<Link> specialLinks = new List<Link>();
        foreach (Link link in m_links)
        {
            if (specialLinks.Count >= count) break;
            string subject = byUrl ? link.NavigateUrl : link.Text;
            if (matcher.Match(subject ?? "").Success)
                specialLinks.Add(link);
        }
        return specialLinks;
    }

    /// <summary>True when the address ends in an extension that is never fetched (.rar, .dat, .msi).</summary>
    private static bool HasSkippedExtension(string url)
    {
        return url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
            || url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
            || url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Looks up an encoding by name, falling back to Encoding.Default when the name is unusable.</summary>
    private static Encoding SafeGetEncoding(string name)
    {
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }

    /// <summary>
    /// Returns the encoding named by the charset parameter of a Content-Type
    /// value, or null when the value carries no charset parameter.
    /// </summary>
    private static Encoding EncodingFromContentType(string contentType)
    {
        const string key = "charset=";
        int ix = contentType.IndexOf(key);
        if (ix == -1) return null;

        string name = contentType.Substring(ix + key.Length);
        int end = name.IndexOf(';');                 // ignore any parameters that follow
        if (end != -1) name = name.Substring(0, end);
        return SafeGetEncoding(name.Trim(' ', '"', '\''));
    }

    /// <summary>Reads the whole stream, or returns null once more than <paramref name="maxBytes"/> bytes arrive.</summary>
    private static byte[] ReadBody(Stream stream, int maxBytes)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                if (buffer.Length + read > maxBytes) return null;
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>Decodes bytes through a StreamReader, as the original stream-based read did.</summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Extracts the plain text of this page, including link text, limited
    /// to a number of characters.
    /// </summary>
    /// <param name="firstN">Maximum number of characters</param>
    /// <returns>Plain text of the page</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// Returns links from this page whose address matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link address</param>
    /// <param name="count">Maximum number of links returned</param>
    /// <returns>Matching links, in page order</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return FilterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links from this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link text</param>
    /// <param name="count">Maximum number of links returned</param>
    /// <returns>Matching links, in page order</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return FilterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the plain text of this page with a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is the result</param>
    /// <returns>Value of capture group 1 of the first match, or an empty string</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "") getContext(Int16.MaxValue);

        Regex regex = new Regex(pattern, DefaultOptions);
        Match mc = regex.Match(m_outstr);
        if (mc.Success)
            return mc.Groups[1].Value;
        return string.Empty;
    }

    #endregion

    #region Constructor

    /// <summary>
    /// Downloads and decodes the page. On any failure the instance is left
    /// with empty content and IsGood == false; nothing is thrown.
    /// </summary>
    /// <param name="_url">Absolute address of the page</param>
    private void Init(string _url)
    {
        // Reset state first so every property is usable even if the address is invalid.
        m_links = new List<Link>();
        m_source = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;

        try
        {
            m_uri = new Uri(_url);
            if (HasSkippedExtension(_url))
                return;

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            // Reuse (or create) the cookie container of this host.
            lock (Webpage.webcookies)
            {
                if (Webpage.webcookies.ContainsKey(m_uri.Host))
                    rqst.CookieContainer = Webpage.webcookies[m_uri.Host];
                else
                {
                    CookieContainer cc = new CookieContainer();
                    Webpage.webcookies[m_uri.Host] = cc;
                    rqst.CookieContainer = cc;
                }
            }

            // 'using' releases the connection on every path, including exceptions.
            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = (rsps.ContentType ?? "").ToLower();
                if (!contentType.StartsWith("text/") || rsps.ContentLength > MaxPageBytes)
                    return;

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    // Also enforces the size cap when the server did not announce a length.
                    raw = ReadBody(sm, MaxPageBytes);
                }
                if (raw == null)
                    return;

                Encoding cding = EncodingFromContentType(contentType);
                if (cding != null)
                {
                    m_source = Decode(raw, cding);
                }
                else
                {
                    // No charset in the header: decode with the default encoding, look for
                    // a charset declaration in the markup, then decode the ORIGINAL bytes
                    // again (re-encoding already-decoded text would be lossy).
                    string sniffed = Decode(raw, Encoding.Default);
                    Regex regex = new Regex("charset\\s*=\\s*[\"']?(?<cding>[\\w-]+)", RegexOptions.IgnoreCase);
                    string strcding = regex.Match(sniffed).Groups["cding"].Value;
                    cding = SafeGetEncoding(strcding);

                    m_source = Decode(raw, cding);
                    // Many replacement characters mean the declared charset was wrong.
                    if (m_source.Split('?', '\uFFFD').Length > 100)
                    {
                        m_source = sniffed;
                    }
                }

                m_pagesize = m_source.Length;
                m_uri = rsps.ResponseUri;   // final address after redirects
                m_good = true;
            }
        }
        catch (Exception ex)
        {
            // Best effort: report the failure instead of hiding it, but never throw.
            m_good = false;
            Console.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately.
    /// </summary>
    /// <param name="_url">Address of the page; percent-escapes are unescaped before use</param>
    public Webpage(string _url)
    {
        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch
        {
            // Keep the address as given; Init reports an unusable address via IsGood.
        }
        Init(_url);
    }

    #endregion
}

Call

Webpage webinfo = new Webpage("url");
string text = webinfo.Context; // all content without HTML tags
string html = webinfo.m_html;  // content including HTML tags
// ... the other properties are read the same way

This article is an English translation of an article originally published in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email with a detailed description of the concern or complaint to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Related to Regular Expressions: Crawling Webpages in C# (Retrieving All Information on a Webpage)

Contact Us

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support