C# web crawler
The editors at my company need to crawl webpage content, and they asked me to help with a simple crawling tool.
Fetching page content like this is nothing out of the ordinary, but this version has a few small tweaks, and the code is presented here for reference.
// Namespaces used by the snippets in this post
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

private string GetHttpWebRequest(string url)
{
    HttpWebResponse result;
    string strHTML = string.Empty;
    try
    {
        Uri uri = new Uri(url);
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(uri);
        // Browser-like User-Agent; note the value must not repeat the header name
        myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        myReq.Accept = "*/*";
        myReq.KeepAlive = true;
        myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        result = (HttpWebResponse)myReq.GetResponse();
        Stream receiveStream = result.GetResponseStream();
        StreamReader readerOfStream = new StreamReader(receiveStream, Encoding.GetEncoding("utf-8"));
        strHTML = readerOfStream.ReadToEnd();
        readerOfStream.Close();
        receiveStream.Close();
        result.Close();
    }
    catch
    {
        // Second attempt: same request, but decode the body as GB2312 instead of UTF-8
        Uri uri = new Uri(url);
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(uri);
        myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        myReq.Accept = "*/*";
        myReq.KeepAlive = true;
        myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        try
        {
            result = (HttpWebResponse)myReq.GetResponse();
        }
        catch (WebException ex)
        {
            // Keep the error response; the body of a 404/500 page can still be read
            result = (HttpWebResponse)ex.Response;
        }
        Stream receiveStream = result.GetResponseStream();
        StreamReader readerOfStream = new StreamReader(receiveStream, Encoding.GetEncoding("gb2312"));
        strHTML = readerOfStream.ReadToEnd();
        readerOfStream.Close();
        receiveStream.Close();
        result.Close();
    }
    return strHTML;
}
That is the basic fetch-a-page-by-URL method, with a few small changes. Webpages come in different encodings (hence the UTF-8 attempt with a GB2312 fallback in the catch block), and some sites have anti-crawling measures, which is why the request sends a browser-like User-Agent header. With these modifications the method can still fetch most such pages.
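Hard-coding UTF-8 and GB2312 is brittle. One possible refinement, not part of the original tool: read the charset from the Content-Type response header, or failing that from the page's own meta declaration. The DetectEncoding helper below is a hypothetical sketch of that idea; it assumes the response body has already been read into a byte array.

private static Encoding DetectEncoding(HttpWebResponse response, byte[] rawBytes)
{
    // 1) charset from the Content-Type header, e.g. "text/html; charset=gb2312"
    Match m = Regex.Match(response.ContentType ?? "", @"charset=([\w-]+)", RegexOptions.IgnoreCase);
    if (m.Success)
    {
        try { return Encoding.GetEncoding(m.Groups[1].Value); }
        catch (ArgumentException) { /* unknown charset name, fall through */ }
    }
    // 2) charset declared in the page itself, e.g. <meta charset="gb2312">
    string ascii = Encoding.ASCII.GetString(rawBytes);
    m = Regex.Match(ascii, @"charset\s*=\s*[""']?([\w-]+)", RegexOptions.IgnoreCase);
    if (m.Success)
    {
        try { return Encoding.GetEncoding(m.Groups[1].Value); }
        catch (ArgumentException) { }
    }
    // 3) last resort
    return Encoding.UTF8;
}

To use it, buffer the response stream into a MemoryStream first, call DetectEncoding on the bytes, and then decode the string once with the detected encoding instead of retrying the whole request.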
Next, here is the method that extracts all the URLs on a webpage.
/// <summary>
/// Extract the URLs in the HTML code.
/// </summary>
/// <param name="htmlCode"></param>
/// <param name="url"></param>
/// <returns></returns>
private static List<string> GetHyperLinks(string htmlCode, string url)
{
    bool IsGenxin = false;                            // set when a new link is found
    StringBuilder weburlSB = new StringBuilder();     // accumulated for SQL
    StringBuilder linkSb = new StringBuilder();       // accumulated for display
    List<string> Weburllistzx = new List<string>();   // newly added links
    List<string> Weburllist = new List<string>();     // previously known links
    string ProductionContent = htmlCode;

    // The site root ("http://example.com/"), used to absolutize root-relative hrefs
    Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
    string wangzhanyuming = reg.Match(url, 0).Value;  // site domain

    // Rewrite relative hrefs to absolute ones, then collect every <a ... href=...> tag
    MatchCollection mc = Regex.Matches(
        ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming)
                         .Replace("href='/", "href='" + wangzhanyuming)
                         .Replace("href=/", "href=" + wangzhanyuming)
                         .Replace("href=\"./", "href=\"" + wangzhanyuming),
        @"<[aA][^>]* href=[^>]*>",
        RegexOptions.Singleline);

    int Index = 1;
    foreach (Match m in mc)
    {
        // Absolute URLs inside the tag
        MatchCollection mc1 = Regex.Matches(m.Value, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
        if (mc1.Count > 0)
        {
            foreach (Match m1 in mc1)
            {
                string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                weburlSB.Append("$-$");
                weburlSB.Append(linkurlstr);
                weburlSB.Append("$_$");
                if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
                {
                    IsGenxin = true;
                    Weburllistzx.Add(linkurlstr);
                    linkSb.AppendFormat("{0}<br/>", linkurlstr);
                }
            }
        }
        else if (m.Value.IndexOf("javascript") == -1)
        {
            // Path-relative href: prepend the directory of the current page, then re-match
            string wangzhanxiangduilujin = url.Substring(0, url.LastIndexOf("/") + 1);  // relative base path
            string amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin)
                                  .Replace("href='", "href='" + wangzhanxiangduilujin);
            MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
            foreach (Match m1 in mc11)
            {
                string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
                weburlSB.Append("$-$");
                weburlSB.Append(linkurlstr);
                weburlSB.Append("$_$");
                if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
                {
                    IsGenxin = true;
                    Weburllistzx.Add(linkurlstr);
                    linkSb.AppendFormat("{0}<br/>", linkurlstr);
                }
            }
        }
        Index++;
    }
    return Weburllistzx;
}
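A quick usage sketch showing how the two methods combine (the URL is just a placeholder):

// Fetch a page and print every hyperlink found on it
string url = "http://www.example.com/";
string html = GetHttpWebRequest(url);
List<string> links = GetHyperLinks(html, url);
foreach (string link in links)
    Console.WriteLine(link);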
Link extraction here relies entirely on regular expressions for matching. Next come the methods that obtain the page title and store the URLs in an XML file.
/// <summary>
/// Write the URLs to an XML file.
/// </summary>
/// <param name="strURL"></param>
/// <param name="alHyperLinks"></param>
private static void WriteToXml(string strURL, List<string> alHyperLinks)
{
    XmlTextWriter writer = new XmlTextWriter(@"D:\HyperLinks.xml", Encoding.UTF8);
    writer.Formatting = Formatting.Indented;
    writer.WriteStartDocument(false);
    writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
    writer.WriteComment("Hyperlinks extracted from " + strURL);
    writer.WriteStartElement("HyperLinks");
    writer.WriteStartElement("HyperLinks", null);
    writer.WriteAttributeString("DateTime", DateTime.Now.ToString());
    foreach (string str in alHyperLinks)
    {
        string title = GetDomain(str);   // element name = domain suffix
        string body = str;               // element text = the URL itself
        writer.WriteElementString(title, null, body);
    }
    writer.WriteEndElement();
    writer.WriteEndElement();
    writer.Flush();
    writer.Close();
}

/// <summary>
/// Obtain the domain-name suffix.
/// </summary>
/// <param name="strURL"></param>
/// <returns></returns>
private static string GetDomain(string strURL)
{
    string retVal;
    string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
    Match m = r.Match(strURL);
    retVal = m.ToString();
    // Strip the dot and the trailing slash, leaving e.g. "com"
    strRegex = @"\.|/$";
    retVal = Regex.Replace(retVal, strRegex, "").ToString();
    if (retVal == "")
        retVal = "other";
    return retVal;
}

/// <summary>
/// Obtain the page title.
/// </summary>
/// <param name="html"></param>
/// <returns></returns>
private static string GetTitle(string html)
{
    string titleFilter = @"<title>[\s\S]*?</title>";
    // The original post breaks off at this point; the rest of the method is a
    // minimal, assumed completion that falls back to the first <h1> and strips tags.
    string h1Filter = @"<h1[^>]*>[\s\S]*?</h1>";
    string match = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase).Value;
    if (string.IsNullOrEmpty(match))
        match = Regex.Match(html, h1Filter, RegexOptions.IgnoreCase).Value;
    return Regex.Replace(match, @"<[^>]*>", string.Empty).Trim();
}
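Tying it all together, a short end-to-end sketch (again with a placeholder URL):

string url = "http://www.example.com/";
string html = GetHttpWebRequest(url);
WriteToXml(url, GetHyperLinks(html, url));

D:\HyperLinks.xml then contains one element per link, named after the link's domain suffix (com, net, cn, org, gov, or "other"), with the URL as its text.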
Those are all the methods involved, and there is still plenty of room for improvement. If you spot any deficiencies, please point them out. Thank you!
Source code: http://download.csdn.net/detail/nightmareyan/9584256