Regular Expressions in C#: Crawling Web Pages (Retrieving All the Information on a Page)

Source: Internet
Author: User

 

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads a single web page and exposes its title, raw HTML, plain text
/// and hyperlinks, all extracted with regular expressions.
/// </summary>
/// <remarks>
/// The download happens once, in the constructor. A failed download never
/// throws to the caller: <see cref="IsGood"/> is false and the text/link
/// properties are empty. Title, plain text and links are computed lazily on
/// first access and then cached.
/// </remarks>
public class WebPage
{
    #region Private members

    // Responses whose declared Content-Length exceeds this (4 MiB) are skipped;
    // the same limit caps how many bytes are actually read from the stream.
    private const long MaxContentLength = 1 << 22;

    private static readonly Regex TitleRegex = new Regex(
        @"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // <a href="..."> ... </a>; the "text" group is the anchor's visible text.
    private static readonly Regex AnchorRegex = new Regex(
        @"<a\shref\s*=""(?<url>[^""]*).*?>(?<text>[^<]*)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // <frame src=...> and <iframe src=...>.
    private static readonly Regex FrameRegex = new Regex(
        "<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>",
        RegexOptions.IgnoreCase);

    // Tags, whitespace, &nbsp;, stray ampersands and quotes removed from link text.
    private static readonly Regex LinkTextNoiseRegex = new Regex(
        "(<[^>]+>)|(\\s)|(&nbsp;)|&|\"",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex ScriptBlockRegex = new Regex(
        @"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex StyleBlockRegex = new Regex(
        @"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex SelectBlockRegex = new Regex(
        @"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex AnchorBlockRegex = new Regex(
        @"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex TagOrNbspRegex = new Regex(
        "(<[^>]+?>)|&nbsp;",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRunRegex = new Regex(
        "(\\s)+",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // Matches charset=xxx in a Content-Type header as well as in
    // <meta ... charset=xxx"> and <meta charset="xxx"> inside the document.
    private static readonly Regex CharsetRegex = new Regex(
        @"charset\s*=\s*[""']?(?<cding>[\w\-]+)",
        RegexOptions.IgnoreCase);

    private Uri m_uri;                                  // address of the page (after redirects)
    private List<Link> m_links = new List<Link>();      // links found on the page
    private string m_title = "";                        // cached <title> text
    private string m_html = "";                         // raw HTML
    private string m_outstr = "";                       // cached plain text
    private bool m_good;                                // true only after a successful download
    private int m_pagesize;                             // length of m_html, in characters

    // One cookie container per host, shared by every WebPage instance.
    // All access goes through lock (webcookies).
    private static Dictionary<string, CookieContainer> webcookies =
        new Dictionary<string, CookieContainer>();

    #endregion

    #region Properties

    /// <summary>
    /// Absolute address of the page (read-only). Empty when the address given
    /// to the constructor could not be parsed.
    /// </summary>
    public string URL
    {
        get { return m_uri == null ? "" : m_uri.AbsoluteUri; }
    }

    /// <summary>
    /// Trimmed content of the page's title element (read-only); empty when
    /// the page has none.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// Raw HTML of the page (read-only); never null.
    /// </summary>
    public string M_html
    {
        get { return m_html ?? ""; }
    }

    /// <summary>
    /// All links found on the page (read-only).
    /// </summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0) getLinks();
            return m_links;
        }
    }

    /// <summary>
    /// Plain text of the page with markup removed (read-only). Limited to the
    /// first Int16.MaxValue characters only on the call that builds the cache.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "") getContext(Int16.MaxValue);
            return m_outstr;
        }
    }

    /// <summary>
    /// Size of the page, measured as the number of characters of decoded HTML.
    /// </summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// Links on this page that point to the same host as the page itself.
    /// </summary>
    public List<Link> InSiteLinks
    {
        get
        {
            if (m_uri == null) return new List<Link>();

            // The host is escaped so that '.' only matches a literal dot, and the
            // trailing group stops "example.com" from matching "example.com.evil.org".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?:[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, Int16.MaxValue);
        }
    }

    /// <summary>
    /// True when the page was downloaded and decoded successfully.
    /// </summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>
    /// Host name of the site the page belongs to; empty when the address
    /// could not be parsed.
    /// </summary>
    public string Host
    {
        get { return m_uri == null ? "" : m_uri.Host; }
    }

    #endregion

    /// <summary>
    /// Extracts anchor and frame/iframe links from the HTML and caches them.
    /// </summary>
    /// <returns>The cached list of links.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count > 0 || m_uri == null) return m_links;

        Regex[] patterns = new Regex[] { AnchorRegex, FrameRegex };
        for (int i = 0; i < patterns.Length; i++)
        {
            for (Match match = patterns[i].Match(m_html); match.Success; match = match.NextMatch())
            {
                try
                {
                    // Resolve relative addresses against the page's own address.
                    Uri resolved = new Uri(m_uri, match.Groups["url"].Value);
                    string url = HttpUtility.UrlDecode(resolved.AbsoluteUri);

                    // Only anchors carry visible text; frames get an empty string.
                    string text = "";
                    if (i == 0)
                    {
                        text = LinkTextNoiseRegex.Replace(match.Groups["text"].Value, "");
                    }

                    Link link = new Link();
                    link.Text = text;
                    link.NavigateUrl = url;
                    m_links.Add(link);
                }
                catch (Exception ex)
                {
                    // A single malformed href must not abort the whole scan.
                    Console.WriteLine(ex.Message);
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Strips markup from HTML and returns the first characters of the result.
    /// </summary>
    /// <param name="instr">HTML code.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">Whether the text inside anchors is kept.</param>
    /// <returns>Plain text, at most <paramref name="firstN"/> characters long.</returns>
    /// <remarks>
    /// The stripped text is cached in m_outstr on the first call; later calls
    /// reuse that cache and ignore <paramref name="instr"/> and
    /// <paramref name="withLink"/>.
    /// </remarks>
    private string getFirstNChar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr ?? "";
            text = ScriptBlockRegex.Replace(text, "");
            text = StyleBlockRegex.Replace(text, "");
            text = SelectBlockRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorBlockRegex.Replace(text, "");
            }
            text = TagOrNbspRegex.Replace(text, "");
            // Collapse each whitespace run to one space so words stay separated.
            text = WhitespaceRunRegex.Replace(text, " ");
            m_outstr = text;
        }

        if (firstN <= 0) return "";
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches
    /// <paramref name="pattern"/>, in page order.
    /// </summary>
    private List<Link> filterLinks(string pattern, int count, bool matchOnUrl)
    {
        if (m_links.Count == 0) getLinks();

        // Built once, not once per link.
        Regex matcher = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        List<Link> selected = new List<Link>();
        foreach (Link candidate in m_links)
        {
            if (selected.Count >= count) break;

            string subject = (matchOnUrl ? candidate.NavigateUrl : candidate.Text) ?? "";
            if (matcher.IsMatch(subject))
            {
                selected.Add(candidate);
            }
        }
        return selected;
    }

    #region Public methods

    /// <summary>
    /// Extracts the first characters of the page's plain text, including the
    /// text of links.
    /// </summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <returns>Plain text, at most <paramref name="firstN"/> characters long.</returns>
    public string getContext(int firstN)
    {
        return getFirstNChar(m_html, firstN, true);
    }

    /// <summary>
    /// Returns links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link's URL.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return filterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link's text.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return filterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text for a regular expression.
    /// </summary>
    /// <param name="pattern">
    /// Regular expression; the value of its first capturing group is returned.
    /// </param>
    /// <returns>
    /// The first group's value for the first match, or an empty string when
    /// nothing matches.
    /// </returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "") getContext(Int16.MaxValue);

        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Constructor

    /// <summary>
    /// True for addresses ending in an extension that is never fetched.
    /// </summary>
    private static bool hasSkippedExtension(string url)
    {
        return url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
            || url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
            || url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Reads a stream to its end. Returns null as soon as more than
    /// <paramref name="limit"/> bytes have been read.
    /// </summary>
    private static byte[] readAllBytes(Stream input, long limit)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
                if (buffer.Length > limit) return null;
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Returns the charset name found in <paramref name="text"/>, or null.
    /// </summary>
    private static string extractCharset(string text)
    {
        if (string.IsNullOrEmpty(text)) return null;

        Match m = CharsetRegex.Match(text);
        return m.Success ? m.Groups["cding"].Value : null;
    }

    /// <summary>
    /// Maps a charset name to an Encoding; null when the name is missing or
    /// not recognised.
    /// </summary>
    private static Encoding encodingFromCharset(string name)
    {
        if (string.IsNullOrEmpty(name)) return null;

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Decodes the response body. The charset from the Content-Type header
    /// wins; otherwise a charset declared inside the document is used;
    /// otherwise the system default encoding.
    /// </summary>
    private static string decodeBody(byte[] raw, string contentTypeHeader)
    {
        Encoding fromHeader = encodingFromCharset(extractCharset(contentTypeHeader));
        if (fromHeader != null) return fromHeader.GetString(raw);

        // Decode provisionally just to find a charset declaration, then decode
        // the ORIGINAL bytes again; re-encoding the provisional string would be lossy.
        string provisional = Encoding.Default.GetString(raw);
        Encoding declared = encodingFromCharset(extractCharset(provisional));
        if (declared == null) return provisional;

        string decoded = declared.GetString(raw);

        // Many '?' characters suggest the declared charset was wrong; fall back.
        return decoded.Split('?').Length > 100 ? provisional : decoded;
    }

    /// <summary>
    /// Downloads and decodes the page at <paramref name="_url"/>. Never throws:
    /// on any failure the page is left empty with IsGood false.
    /// </summary>
    private void Init(string _url)
    {
        // Reset first so the object is usable even if the address is invalid.
        m_links = new List<Link>();
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;

        try
        {
            m_uri = new Uri(_url);
            if (hasSkippedExtension(_url)) return;

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            lock (webcookies)
            {
                CookieContainer cc;
                if (!webcookies.TryGetValue(m_uri.Host, out cc))
                {
                    cc = new CookieContainer();
                    webcookies[m_uri.Host] = cc;
                }
                rqst.CookieContainer = cc;
            }

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = (rsps.ContentType ?? "").ToLower();
                if (!contentType.StartsWith("text/") || rsps.ContentLength > MaxContentLength)
                {
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = readAllBytes(sm, MaxContentLength);
                }
                if (raw == null) return;   // body larger than the limit

                m_html = decodeBody(raw, rsps.ContentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;   // reflects any redirect that was followed
                m_good = true;
            }
        }
        catch (Exception ex)
        {
            // Best effort: report the failure and leave the page marked as not good.
            m_good = false;
            Console.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Creates the page object and immediately downloads the page.
    /// </summary>
    /// <param name="_url">Absolute address of the page; may be percent-encoded.</param>
    public WebPage(string _url)
    {
        string target = _url;
        try
        {
            target = Uri.UnescapeDataString(_url);
        }
        catch (Exception)
        {
            // Keep the address exactly as given when it cannot be unescaped.
        }
        Init(target);
    }

    #endregion
}

Call

WebPage webInfo = new WebPage("url");
string text = webInfo.Context; // all content, with HTML tags removed
string html = webInfo.M_html;  // content including HTML tags
// ... the other properties are read the same way

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.