Using C# to write a crawler to download ship pictures from MarineTraffic

Recently, work on ship recognition has required a large number of positive samples to train an AdaBoost classifier, so I needed to download ship pictures from the MarineTraffic site. Writing a crawler that downloads them automatically is obviously far more convenient than saving them by hand.
Site features
Before introducing the crawler, let's look at a few characteristics of the MarineTraffic site:
1. Crawler behavior is detected regularly. If the site decides that a connection is downloading a suspiciously large number of pictures, it blacklists that connection, and nothing can be downloaded from it for the next few days.
2. Photo coverage varies enormously from ship to ship: some ships have more than 1,000 pictures while others have none at all, and we want pictures of many different ships. The download order therefore has to be prioritized by photo count (the listing URL that makes this sorting possible is sketched after this list).
3. Positive samples for training the classifier must all have the same resolution as the detection window. When downloading, MarineTraffic lets you set the picture width, and the site generates the height from each photo's aspect ratio. As a result, the heights differ from picture to picture and have to be normalized in post-processing.
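To make feature 2 concrete, here is a minimal sketch of fetching one page of the listing sorted by photo count. It uses the same URL pattern and placeholder User-Agent string as the full crawler source below; treat it as an illustration of the request, not a hardened client.

using System;
using System.Net;
using System.Text;

class ListingProbe
{
    static void Main()
    {
        // Page 1 of the "all ships" index, sorted so the best-photographed
        // ships come first (same URL pattern as the full crawler below).
        WebClient wc = new WebClient();
        wc.Headers["User-Agent"] = "blah";   // placeholder UA, as in the full source
        byte[] data = wc.DownloadData(
            "http://www.marinetraffic.com/en/ais/index/ships/all/page:1"
            + "/sort:count_photos/direction:desc/per_page:50");
        Console.WriteLine(Encoding.UTF8.GetString(data).Length + " bytes of HTML");
    }
}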
How to Solve
- For crawler detection: wait a random interval of roughly 10-20 seconds between requests. This is enough to stay under the site's detection threshold.
- Sort the ships by photo count and download from the best-covered ships first, but don't take too many pictures of any single ship, so that the sample set stays varied.
- Use a uniform width when downloading.
- In post-processing, bring the downloaded pictures to the same resolution as the detection window (a minimal resize sketch follows below).
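The post-processing step is not part of the crawler source below, so here is a minimal sketch of how the height normalization might look, assuming .NET's System.Drawing is available. The 48x48 sample size and both directory paths are placeholders, not values from the original post.

using System;
using System.Drawing;
using System.IO;

class NormalizeSamples
{
    static void Main()
    {
        string srcDir = @"C:\ship_pictures";   // hypothetical input folder
        string dstDir = @"C:\ship_samples";    // hypothetical output folder
        Directory.CreateDirectory(dstDir);
        foreach (string path in Directory.GetFiles(srcDir, "*.jpg"))
        {
            using (Bitmap src = new Bitmap(path))
            using (Bitmap dst = new Bitmap(48, 48))        // assumed training-window size
            using (Graphics g = Graphics.FromImage(dst))
            {
                // The site derives each height from the aspect ratio, so
                // heights vary; rescale everything to one fixed resolution.
                g.DrawImage(src, 0, 0, 48, 48);
                dst.Save(Path.Combine(dstDir, Path.GetFileName(path)));
            }
        }
    }
}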
Crawler source code
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Threading;

namespace ShipImageCrawler
{
    class Program
    {
        // Walk the "all ships" listing pages, sorted by photo count (descending),
        // and collect every ship ID that appears.
        static void Download_all_shipid(List<string> shipid_list)
        {
            try
            {
                WebClient myWebClient = new WebClient();
                myWebClient.Headers["User-Agent"] = "blah";
                myWebClient.Credentials = CredentialCache.DefaultCredentials; // credentials used to authenticate the request

                // Relevant URL patterns:
                //   http://www.marinetraffic.com/en/photos/of/ships/shipid:281519/
                //   http://www.marinetraffic.com/en/ais/index/ships/all
                //   http://www.marinetraffic.com/ais/index/ships/all/page:2/sort:count_photos/direction:desc
                for (int pageNum = 1; pageNum < 100; pageNum++)   // first 100 listing pages
                {
                    Console.WriteLine("Start analysing page " + pageNum);
                    // Re-apply credentials and headers; WebClient can reset them between requests.
                    myWebClient.Credentials = CredentialCache.DefaultCredentials;
                    myWebClient.Headers["User-Agent"] = "blah";
                    try
                    {
                        // Download one listing page; ships with the most photos come first.
                        byte[] pageData = myWebClient.DownloadData(
                            @"http://www.marinetraffic.com/en/ais/index/ships/all/page:"
                            + pageNum + "/sort:count_photos/direction:desc/per_page:50");
                        // The site serves UTF-8; Encoding.Default would be needed for a GB2312 page.
                        string pageHtml = Encoding.UTF8.GetString(pageData);

                        // Scan the HTML for every "shipid:<id>/" occurrence.
                        int urlIndex = -1;
                        string orgLabel = "shipid:";
                        urlIndex = pageHtml.IndexOf(orgLabel, urlIndex + 1);
                        while (urlIndex != -1)
                        {
                            int endOfUrl = pageHtml.IndexOf("/", urlIndex + orgLabel.Length);
                            string shipid = pageHtml.Substring(urlIndex + orgLabel.Length,
                                                               endOfUrl - urlIndex - orgLabel.Length);
                            if (!shipid_list.Contains(shipid))
                            {
                                Console.WriteLine("New ID: " + shipid);
                                shipid_list.Add(shipid);
                            }
                            urlIndex = pageHtml.IndexOf(orgLabel, urlIndex + 1);
                        }
                        // Optionally dump the page for inspection:
                        // using (StreamWriter sw = new StreamWriter("ouput.html")) { sw.Write(pageHtml); }
                        Console.WriteLine("Finished analysing page " + pageNum);
                    }
                    catch (WebException webEx)
                    {
                        Console.WriteLine(webEx.Message);
                    }

                    // Sleep a random 10-19 seconds so the request pattern evades
                    // the site's crawler-behaviour detection.
                    Console.Write("Bypassing site crawler behaviour detection ...");
                    Random rd = new Random();
                    int time_sleep = rd.Next() % 10 + 10;
                    Thread.Sleep(time_sleep * 1000);
                    Console.WriteLine();
                }
                Console.WriteLine("End of analysis");

                // Serialize the ID list so a later run can resume without re-crawling.
                string file = @"C:\Users\dragonfive\Desktop\ship_pictures\third_batch\0_100page_shipid.txt";
                using (FileStream fsWriter = new FileStream(file, FileMode.OpenOrCreate, FileAccess.Write))
                {
                    BinaryFormatter bf = new BinaryFormatter();
                    bf.Serialize(fsWriter, shipid_list);
                }
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message);
            }
        }

        /// <summary>
        /// Download every picture belonging to the given ship_id.
        /// </summary>
        /// <param name="ship_id">ID of the ship whose photo page is fetched.</param>
        static void Download_jpg(string ship_id)
        {
            try
            {
                Console.WriteLine("Start downloading pictures for shipid " + ship_id);
                WebClient myWebClient = new WebClient();
                myWebClient.Credentials = CredentialCache.DefaultCredentials;
                myWebClient.Headers["User-Agent"] = "blah";

                // Photo listing, e.g.:
                //   http://www.marinetraffic.com/en/photos/of/ships/shipid:371668/per_page:1000/page:1
                byte[] pageData = myWebClient.DownloadData(
                    @"http://www.marinetraffic.com/en/photos/of/ships/shipid:"
                    + ship_id + @"/per_page:100/page:1");
                string pageHtml = Encoding.UTF8.GetString(pageData);
                Console.WriteLine("Meta page has been downloaded");

                // Each photo URL sits in a data-original='...' attribute (lazy loading).
                int urlIndex = -1;
                string orgLabel = "data-original='";
                urlIndex = pageHtml.IndexOf(orgLabel, urlIndex + 1);
                int i = 0;
                while (urlIndex != -1)
                {
                    int endOfUrl = pageHtml.IndexOf("'", urlIndex + orgLabel.Length);
                    string url = pageHtml.Substring(urlIndex + orgLabel.Length,
                                                    endOfUrl - urlIndex - orgLabel.Length);

                    // Random 10-19 second pause before each picture, again to evade detection.
                    Console.Write("Bypassing site crawler behaviour detection ...");
                    Random rd = new Random();
                    int time_sleep = rd.Next() % 10 + 10;
                    Thread.Sleep(time_sleep * 1000);
                    Console.WriteLine();

                    try
                    {
                        Console.WriteLine(url);
                        myWebClient.Credentials = CredentialCache.DefaultCredentials;
                        myWebClient.Headers["User-Agent"] = "blah";
                        byte[] jpgData = myWebClient.DownloadData(url);
                        // Save the picture as <ship_id>_<index>.jpg.
                        using (FileStream fs = new FileStream(
                            @"C:\Users\dragonfive\Desktop\ship_pictures\third_batch\" + ship_id + "_" + i + ".jpg",
                            FileMode.OpenOrCreate, FileAccess.Write))
                        {
                            fs.Write(jpgData, 0, jpgData.Length);
                        }
                    }
                    catch (WebException webEx)
                    {
                        Console.WriteLine("Have we been caught?");
                        Console.WriteLine(webEx.Message);
                    }
                    Console.WriteLine("Successfully downloaded picture " + (++i));
                    urlIndex = pageHtml.IndexOf(orgLabel, urlIndex + 1);
                }
                Console.WriteLine("*****************************************");
                Console.WriteLine("Downloaded " + i + " pictures for ship_id " + ship_id);
                Console.WriteLine("*****************************************");
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message);
            }
        }

        static void Main(string[] args)
        {
            List<string> shipid_list = new List<string>();
            // shipid_list.Add("371681");   // handy for quickly testing a single ship
            Download_all_shipid(shipid_list);

            // To resume a previous run, deserialize the saved ID list instead:
            // string file = @"C:\Users\dragonfive\Desktop\ship_pictures\third_batch\0_100page_shipid.txt";
            // using (FileStream fsReader = new FileStream(file, FileMode.Open, FileAccess.Read))
            // {
            //     BinaryFormatter bf = new BinaryFormatter();
            //     shipid_list = (List<string>)bf.Deserialize(fsReader);
            //     Console.WriteLine("Loaded " + shipid_list.Count + " ship IDs");
            // }
            // ...then drop the IDs that are already downloaded:
            // shipid_list.Remove("371652"); shipid_list.Remove("371668");
            // (similarly for 371681, 1252401, 371077, 132264, 224871,
            //  279923, 369163, 266342, 371216, 368174)

            foreach (var ship_id in shipid_list)
            {
                Download_jpg(ship_id);
            }
            Console.ReadLine();   // keep the console window open
        }
    }
}
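One detail of the source worth tightening: both methods construct a new Random inside their download loops. Random is time-seeded, so instances created in quick succession can emit the same delay. A single shared instance, sketched below with a hypothetical PoliteWait helper (not part of the original code), avoids that while keeping the same 10-19 second range:

// A single shared generator: time-seeded Random instances created in a
// tight loop can repeat values, so reuse one for every delay.
static readonly Random Rd = new Random();

// Sleep a random 10-19 seconds, matching the delay the crawler uses
// between requests to stay under the detection threshold.
static void PoliteWait()
{
    int seconds = Rd.Next(10, 20);   // uniform over 10..19
    Thread.Sleep(seconds * 1000);
}

Both Download_all_shipid and Download_jpg could then call PoliteWait() in place of their inline Random blocks.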