ASP.NET uses regular expressions and network programming to crawl web page data (useful)
ASP.NET uses regular expressions and network programming to crawl web page data (useful)
/// <summary>
/// Downloads the page at <paramref name="strURL"/> and returns the text found
/// between the first occurrence of <paramref name="Begin"/> and the nearest
/// following <paramref name="End"/>, with HTML markup stripped by <c>NoHTML</c>.
/// </summary>
/// <param name="strURL">Address of the page to download.</param>
/// <param name="Begin">
/// Marker that precedes the wanted content. It is inserted into a regular
/// expression as-is, so regex metacharacters in it are interpreted, not escaped.
/// </param>
/// <param name="End">
/// Marker that follows the wanted content. Inserted into the regular
/// expression as-is, like <paramref name="Begin"/>.
/// </param>
/// <returns>
/// The cleaned text between the markers, or an empty string when the markers
/// are not found.
/// </returns>
private static string GetContent(string strURL, string Begin, string End)
{
    string html;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL);

    // Both the response and the reader are disposed even if reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        html = reader.ReadToEnd();
    }

    // Look-behind / look-ahead keep the markers themselves out of the match.
    // [\s\S] matches any character including newlines; the lazy quantifier
    // stops at the first End marker.
    Match table = Regex.Match(
        html,
        "(?<=" + Begin + ")[\\s\\S]*?(?=" + End + ")",
        RegexOptions.IgnoreCase);

    return NoHTML(table.Value);
}

/// <summary>
/// Removes HTML tags, scripts and comments from a fragment and decodes a small
/// set of character entities.
/// </summary>
/// <param name="htmlstring">Source text containing HTML code.</param>
/// <returns>
/// The text after markup removal. Everything up to and including the first
/// remaining line feed is dropped, then everything up to and including the
/// last apostrophe is dropped. Returns an empty string for null or empty input.
/// </returns>
private static string NoHTML(string htmlstring)
{
    if (string.IsNullOrEmpty(htmlstring))
    {
        return string.Empty;
    }

    // Delete script blocks. Singleline lets '.' cross line breaks so that
    // multi-line scripts are removed together with their body.
    htmlstring = Regex.Replace(htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Delete HTML tags.
    htmlstring = Regex.Replace(htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Delete a line break together with the whitespace that follows it.
    htmlstring = Regex.Replace(htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Delete leftover comment terminators, then unterminated comment openers
    // through the end of their line.
    htmlstring = Regex.Replace(htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the named / numeric entities this helper knows about.
    htmlstring = Regex.Replace(htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    // A non-breaking space becomes a regular space so adjacent words stay apart.
    htmlstring = Regex.Replace(htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    htmlstring = Regex.Replace(htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

    // Any other numeric entity is dropped.
    htmlstring = Regex.Replace(htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Remove stray angle brackets (including ones produced by decoding
    // &lt; / &gt; above) and CR/LF pairs.
    htmlstring = Regex.Replace(htmlstring, ">", "");
    htmlstring = Regex.Replace(htmlstring, "<", "");
    htmlstring = Regex.Replace(htmlstring, "\r\n", "");

    // Drop everything up to and including the first line feed. When there is
    // none, IndexOf returns -1 and the whole string is kept. The char overload
    // is an ordinal search, independent of the current culture.
    htmlstring = htmlstring.Substring(htmlstring.IndexOf('\n') + 1);

    // Keep only the text after the last apostrophe. A later search for an
    // apostrophe-quoted class marker could never match after this step, so
    // no such check is performed.
    int lastQuote = htmlstring.LastIndexOf('\'');
    if (lastQuote >= 0)
    {
        htmlstring = htmlstring.Substring(lastQuote + 1);
    }

    return htmlstring;
}
ASP.NET uses regular expressions and network programming to crawl web page data (useful)