An enhanced, multi-threaded web crawler in C#
I previously wrote a crawler to help a colleague, but it was fairly rough. The company now needs it for a project, so I have made some modifications and added features: URL and image collection, image downloading, and multi-threaded downloading from the collected image URLs.
The idea: first fetch the full content of the initial URL, capture the images on that page, and collect the links it contains; put the collected links into a queue, then continue capturing images and collecting links from each queued URL — looping indefinitely.
Let's take a look at the picture:
Both the page-content fetching and the URL crawling have been improved. The code follows — there are still shortcomings, so suggestions are welcome!
Page content fetching: HtmlCodeRequest.
Page URL crawling: GetHttpLinks — uses a regular expression to filter the links out of the HTML.
Image capture: GetHtmlImageUrlList — uses a regular expression to filter the img tags in the HTML.
All of these are written in one helper class, HttpHelper.
/// <summary>
/// Downloads the HTML content of the given URL as a string.
/// </summary>
/// <param name="url">The page address to request.</param>
/// <returns>The page HTML, or an empty string on any failure.</returns>
public static string HtmlCodeRequest(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }
    try
    {
        // Create the request.
        HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
        httpRequest.KeepAlive = true;
        httpRequest.Method = "GET";
        // NOTE(review): the original value embedded the "user-agent:" header-name
        // prefix inside the header VALUE; the prefix belongs to the name only.
        httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        httpRequest.Accept = "*/*";
        httpRequest.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        httpRequest.ServicePoint.Expect100Continue = false;
        // Timeout in milliseconds. The original constant was lost in transcription;
        // 30 seconds is a sane default for a crawler — TODO confirm.
        httpRequest.Timeout = 30000;
        httpRequest.AllowAutoRedirect = true; // follow 302 redirects
        // Allow many simultaneous connections (original constant also lost — TODO confirm).
        ServicePointManager.DefaultConnectionLimit = 300;

        string content;
        // using-blocks dispose the response and streams deterministically, even when
        // ReadToEnd throws (the original leaked them on exception and relied on Abort).
        using (HttpWebResponse webResponse = (HttpWebResponse)httpRequest.GetResponse())
        using (Stream stream = webResponse.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
        {
            content = reader.ReadToEnd();
        }
        return content;
    }
    catch (Exception)
    {
        // Best-effort crawl: a page that fails to download simply yields no content.
        return "";
    }
}

/// <summary>
/// Extracts the URLs of all images (img src attributes) from the page at the given URL.
/// </summary>
/// <param name="url">The page address to scan.</param>
/// <returns>List of image URLs; empty when the page could not be fetched.</returns>
public static List<string> GetHtmlImageUrlList(string url)
{
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Match <img ... src="..."> tags; the src value is captured as group "imgUrl".
    Regex regImg = new Regex(
        @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
        RegexOptions.IgnoreCase);
    MatchCollection matches = regImg.Matches(html);
    List<string> urlList = new List<string>();
    foreach (Match match in matches)
    {
        urlList.Add(match.Groups["imgUrl"].Value);
    }
    return urlList;
}

/// <summary>
/// Extracts hyperlinks from the page at the given URL: both absolute http(s)
/// URLs found anywhere in the HTML and href attributes of anchor tags
/// (relative hrefs are resolved against Global.WebUrl).
/// </summary>
/// <param name="url">The page address to scan.</param>
/// <returns>De-duplicated list of links; empty when the page could not be fetched.</returns>
public static List<string> GetHttpLinks(string url)
{
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    List<string> links = new List<string>();
    // O(1) duplicate detection; the original used List.Contains inside the loop (O(n^2)).
    HashSet<string> seen = new HashSet<string>();

    // Pass 1: absolute http/https URLs appearing anywhere in the page.
    const string pattern2 = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w-./?%&=]*)?";
    Regex r2 = new Regex(pattern2, RegexOptions.IgnoreCase);
    foreach (Match m2 in r2.Matches(html))
    {
        string candidate = m2.ToString();
        // NOTE(review): the original skipped links when CheckUrlIsLegal returned TRUE,
        // which looks inverted — confirm that helper's semantics before changing it.
        if (StringHelper.CheckUrlIsLegal(candidate) || !StringHelper.IsPureUrl(candidate) || !seen.Add(candidate))
        {
            continue;
        }
        links.Add(candidate);
    }

    // Pass 2: href attributes of <a> tags, ignoring javascript: and __doPostBack links.
    const string pattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__dopostback)(?<url>[^'""\s*#<>]+)[^>]*>";
    Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
    foreach (Match m in r.Matches(html))
    {
        string href = m.Groups["url"].Value;
        // Resolve site-relative links against the crawl root.
        if (!href.Contains("http"))
        {
            href = Global.WebUrl + href;
        }
        if (!StringHelper.IsPureUrl(href) || !seen.Add(href))
        {
            continue;
        }
        links.Add(href);
    }
    return links;
}
The image download below limits the number of concurrent tasks to 200; when the limit is reached, a thread waits 5 seconds. The download itself is invoked through an asynchronously called delegate.
/// <summary>
/// Downloads the image at the given URL into Global.FloderUrl, naming the
/// file with a fresh GUID plus the URL's original extension.
/// </summary>
/// <param name="url">Absolute or site-relative image URL.</param>
/// <returns>The saved file name on success, or an "error: ..." message string.</returns>
public string DownloadImg(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "error: address is empty";
    }
    try
    {
        // Resolve site-relative paths against the crawl root.
        if (!url.Contains("http"))
        {
            url = Global.WebUrl + url;
        }
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        // Timeout in milliseconds (original constant lost in transcription — TODO confirm).
        request.Timeout = 30000;
        // NOTE(review): the original embedded the "user-agent:" header-name prefix
        // inside the header VALUE; the prefix belongs to the name only.
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
        request.AllowAutoRedirect = true; // follow 302 redirects

        // File name: GUID + original extension. Guard against URLs without a dot —
        // the original Substring(LastIndexOf(".") + 1, ...) misbehaved in that case.
        string baseName = Guid.NewGuid().ToString();
        int dotIndex = url.LastIndexOf('.');
        string extension = dotIndex >= 0 ? url.Substring(dotIndex + 1) : "jpg";
        string fileName = baseName + "." + extension;

        // using-blocks guarantee the response, network stream, and file are closed
        // even if the copy throws (the original leaked all three on exception).
        using (WebResponse response = request.GetResponse())
        using (Stream reader = response.GetResponseStream())
        using (FileStream writer = new FileStream(
            Global.FloderUrl + fileName, FileMode.OpenOrCreate, FileAccess.Write))
        {
            byte[] buffer = new byte[1024];
            int bytesRead; // number of bytes actually read per chunk
            while ((bytesRead = reader.Read(buffer, 0, buffer.Length)) > 0)
            {
                writer.Write(buffer, 0, bytesRead);
            }
        }
        return fileName;
    }
    catch (Exception)
    {
        return "error: Address" + url;
    }
}
That's all for now — the rest is up to you to improve!
C # multithreaded web crawler