用 C# 編寫爬蟲在 marinetraffic 下載船隻圖片

最後更新：2017-06-04 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

標籤：line original 方式 pid bfd put lines binary des

近期在做船隻識別方面的工作，需要大量的正樣本來訓練 AdaBoost 分類器。

於是到 marinetraffic 這個網站上下載船隻圖片，寫個爬蟲來自動下載顯然非常方便。

網站特點

在介紹爬蟲之前首先瞭解一下marinetraffic這個網站的一些特點：
1. 會定期檢測爬蟲行為。如果網站認為有爬蟲在大量下載圖片，會把該連線加入黑名單，之後幾天都沒辦法下載。
2. 船隻圖片資源差異大。有的船隻有 1000 多張圖，有的船隻一張圖也沒有；我們需要的是很多船隻的很多張圖，所以需要對要下載的船隻按優先順序排序。
3. 用來訓練分類器的正樣本要求檢測對象的解析度一致。而在 marinetraffic 網站下載圖片時可以設定下載圖片的寬度，網站再依據長寬比產生對應的高度。所以不同圖片的高度不一樣，需要自己後期處理。

解決方式

針對爬蟲檢測：在每次請求之間設定一個隨機等待時間（10 秒左右，程式中為 10～19 秒），可以繞過網站的爬蟲行為檢測。
對船隻依照圖片數量排序，先下載圖片數量多的，而且每艘船不用下載太多，以保證圖片的差異性。比如程式中每艘船僅抓取第一頁、最多 100 張圖片（per_page:100）。
在下載的時候使用統一的寬度。
後期處理時，從圖片中摳出解析度一致的船隻。

爬蟲原始碼

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace 船僅僅映像爬蟲
{
    /// <summary>
    /// Console crawler that collects ship ids from the marinetraffic.com ship index
    /// (sorted by photo count, descending) and then downloads the photos listed on
    /// the first photo page of every collected ship.
    /// </summary>
    class Program
    {
        /// <summary>Directory that receives the downloaded images and the saved ship-id list.</summary>
        private const string OutputDirectory = @"C:\Users\dragonfive\Desktop\爬蟲獲得船僅僅圖片\第三批\";

        /// <summary>File that receives the serialized ship-id list.</summary>
        private const string ShipIdListFile = OutputDirectory + "0_100page_shipid.txt";

        /// <summary>User-Agent header value sent with every request.</summary>
        private const string UserAgent = "blah";

        /// <summary>Last index page to analyse (pages 1..99 are visited).</summary>
        private const int LastIndexPage = 99;

        /// <summary>Lower bound (inclusive) of the pause between requests, in seconds.</summary>
        private const int MinDelaySeconds = 10;

        /// <summary>Upper bound (exclusive) of the pause between requests, in seconds.</summary>
        private const int MaxDelaySecondsExclusive = 20;

        // One shared generator instead of a new Random() per iteration.
        private static readonly Random DelayRandom = new Random();

        /// <summary>
        /// Downloads <paramref name="url"/> with the crawler's credentials and User-Agent.
        /// </summary>
        /// <param name="client">Client used for the request.</param>
        /// <param name="url">Address to download.</param>
        /// <returns>The raw response body.</returns>
        /// <exception cref="WebException">The request failed.</exception>
        private static byte[] Fetch(WebClient client, string url)
        {
            // The original code re-applied both settings before every request;
            // presumably because WebClient does not reliably keep the User-Agent
            // header between requests. Kept as-is - TODO confirm.
            client.Credentials = CredentialCache.DefaultCredentials;
            client.Headers["User-Agent"] = UserAgent;
            return client.DownloadData(url);
        }

        /// <summary>
        /// Prints <paramref name="message"/>, sleeps for a random 10-19 seconds and
        /// then terminates the console line. The pause is the crawler's way of
        /// staying under the site's crawler detection.
        /// </summary>
        /// <param name="message">Text written (without newline) before sleeping.</param>
        private static void WaitRandomDelay(string message)
        {
            Console.Write(message);
            int seconds = DelayRandom.Next(MinDelaySeconds, MaxDelaySecondsExclusive);
            Thread.Sleep(seconds * 1000);
            Console.WriteLine();
        }

        /// <summary>
        /// Returns every value that appears as <c>shipid:VALUE/</c> in the page, in
        /// document order (duplicates included).
        /// </summary>
        /// <param name="pageHtml">HTML of a ship index page.</param>
        /// <returns>The extracted ids; empty when none are found.</returns>
        private static List<string> ExtractShipIds(string pageHtml)
        {
            const string label = "shipid:";
            var ids = new List<string>();

            int labelIndex = pageHtml.IndexOf(label, StringComparison.Ordinal);
            while (labelIndex != -1)
            {
                int valueStart = labelIndex + label.Length;
                int valueEnd = pageHtml.IndexOf('/', valueStart);
                if (valueEnd == -1)
                {
                    // No terminator left in the page: nothing further can be extracted.
                    // (The original passed the negative length to Substring and crashed.)
                    break;
                }

                string candidate = pageHtml.Substring(valueStart, valueEnd - valueStart);

                // NOTE(review): ids are assumed to be purely numeric (all ids seen in this
                // file are). The check keeps script fragments or markup that merely contain
                // "shipid:" out of request URLs and file names - confirm against the site.
                if (candidate.Length > 0 && candidate.All(c => c >= '0' && c <= '9'))
                {
                    ids.Add(candidate);
                }

                labelIndex = pageHtml.IndexOf(label, labelIndex + 1, StringComparison.Ordinal);
            }

            return ids;
        }

        /// <summary>
        /// Returns the value of every quoted <c>data-original</c> attribute in the page,
        /// in document order.
        /// </summary>
        /// <param name="pageHtml">HTML of a ship photo page.</param>
        /// <returns>The extracted image addresses; empty when none are found.</returns>
        private static List<string> ExtractImageUrls(string pageHtml)
        {
            // NOTE(review): the published listing searched for a typographic quote
            // (U+2018) after "data-original=", which looks like a text-conversion
            // artefact; HTML attributes are delimited by ' or ", so both are accepted.
            const string label = "data-original=";
            var urls = new List<string>();

            int labelIndex = pageHtml.IndexOf(label, StringComparison.Ordinal);
            while (labelIndex != -1)
            {
                int quoteIndex = labelIndex + label.Length;
                if (quoteIndex < pageHtml.Length)
                {
                    char quote = pageHtml[quoteIndex];
                    if (quote == '\'' || quote == '"')
                    {
                        int valueStart = quoteIndex + 1;
                        int valueEnd = pageHtml.IndexOf(quote, valueStart);
                        if (valueEnd == -1)
                        {
                            // Unterminated attribute: stop instead of crashing in Substring.
                            break;
                        }

                        if (valueEnd > valueStart)
                        {
                            urls.Add(pageHtml.Substring(valueStart, valueEnd - valueStart));
                        }
                    }
                }

                labelIndex = pageHtml.IndexOf(label, labelIndex + 1, StringComparison.Ordinal);
            }

            return urls;
        }

        /// <summary>
        /// Serializes the ship-id list to <see cref="ShipIdListFile"/> so a later run can
        /// reload it instead of crawling the index again. I/O failures are reported on
        /// the console and do not abort the run.
        /// </summary>
        /// <param name="shipIds">Ids to persist.</param>
        private static void SaveShipIds(List<string> shipIds)
        {
            try
            {
                Directory.CreateDirectory(OutputDirectory);

                // FileMode.Create truncates an existing file; OpenOrCreate would leave
                // stale trailing bytes behind when the new payload is shorter.
                using (var stream = new FileStream(ShipIdListFile, FileMode.Create, FileAccess.Write))
                {
                    // WARNING: BinaryFormatter is unsafe on untrusted input and has been
                    // removed from recent .NET versions. It is kept only so that files
                    // stay readable by the BinaryFormatter-based loader sketched in Main;
                    // only ever deserialize files this program wrote itself.
                    var formatter = new BinaryFormatter();
                    formatter.Serialize(stream, shipIds);
                }
            }
            catch (IOException ioEx)
            {
                Console.WriteLine(ioEx.Message);
            }
        }

        /// <summary>
        /// Walks index pages 1..99 (50 ships per page, most photos first) and appends
        /// every ship id not yet present to <paramref name="shipid_list"/>, then saves
        /// the list to disk. A failed page is logged and skipped.
        /// </summary>
        /// <param name="shipid_list">List that receives the new ids; existing entries are kept.</param>
        /// <exception cref="ArgumentNullException"><paramref name="shipid_list"/> is null.</exception>
        static void download_all_shipid(List<string> shipid_list)
        {
            if (shipid_list == null)
            {
                throw new ArgumentNullException("shipid_list");
            }

            // Set-based membership test instead of List.Contains inside the loop.
            var knownIds = new HashSet<string>(shipid_list);

            // Example addresses:
            //   http://www.marinetraffic.com/en/photos/of/ships/shipid:281519/
            //   http://www.marinetraffic.com/en/ais/index/ships/all
            //   http://www.marinetraffic.com/ais/index/ships/all/page:2/sort:COUNT_PHOTOS/direction:desc
            using (var client = new WebClient())
            {
                for (int pageNum = 1; pageNum <= LastIndexPage; pageNum++)
                {
                    Console.WriteLine("開始分析第" + pageNum + "張網頁");
                    try
                    {
                        byte[] pageData = Fetch(client, @"http://www.marinetraffic.com/en/ais/index/ships/all/page:" + pageNum + "/sort:COUNT_PHOTOS/direction:desc/per_page:50");

                        // The page is decoded as UTF-8; switch the encoding here if the
                        // site ever serves a different charset.
                        string pageHtml = Encoding.UTF8.GetString(pageData);

                        foreach (string shipid in ExtractShipIds(pageHtml))
                        {
                            if (knownIds.Add(shipid))
                            {
                                Console.WriteLine("新增id:" + shipid);
                                shipid_list.Add(shipid);
                            }
                        }

                        Console.WriteLine("完畢第" + pageNum + "頁分析");
                    }
                    catch (WebException webEx)
                    {
                        Console.WriteLine(webEx.Message.ToString());
                    }

                    // Random pause so the site does not flag the crawl.
                    WaitRandomDelay("繞開網站爬蟲行為檢測中......");
                }
            }

            Console.WriteLine("分析結束");
            SaveShipIds(shipid_list);
        }

        /// <summary>
        /// Downloads every image listed on the first photo page (up to 100 entries) of
        /// the given ship and stores each one as <c>{ship_id}_{index}.jpg</c> in
        /// <see cref="OutputDirectory"/>. A failed image is logged and skipped.
        /// </summary>
        /// <param name="ship_id">Ship id as used in marinetraffic URLs.</param>
        static void download_jpg(string ship_id)
        {
            Console.WriteLine("開始下載shipid為：" + ship_id + "的圖片");

            using (var client = new WebClient())
            {
                string pageHtml;
                try
                {
                    // Example: http://www.marinetraffic.com/en/photos/of/ships/shipid:371668/per_page:1000/page:1
                    byte[] pageData = Fetch(client, @"http://www.marinetraffic.com/en/photos/of/ships/shipid:" + ship_id + @"/per_page:100/page:1");
                    pageHtml = Encoding.UTF8.GetString(pageData);
                }
                catch (WebException webEx)
                {
                    Console.WriteLine(webEx.Message.ToString());
                    return;
                }

                Console.WriteLine("元網頁已下載");
                Directory.CreateDirectory(OutputDirectory);

                int index = 0;      // position of the image on the page; used in the file name
                int downloaded = 0; // images actually written to disk

                foreach (string url in ExtractImageUrls(pageHtml))
                {
                    // Random pause before every image so the site does not flag the crawl.
                    WaitRandomDelay("繞過網站爬蟲行為檢測中......");

                    try
                    {
                        Console.WriteLine(url);
                        byte[] jpgdata = Fetch(client, url);

                        // WriteAllBytes replaces an existing file completely, so a shorter
                        // image can never keep the tail of an older, longer one.
                        File.WriteAllBytes(OutputDirectory + ship_id + "_" + index + ".jpg", jpgdata);

                        // Success is reported only when the image really was saved.
                        Console.WriteLine("成功下載第" + index + "張圖片");
                        downloaded++;
                    }
                    catch (WebException webEx)
                    {
                        Console.WriteLine("被捕獲了嗎?");
                        Console.WriteLine(webEx.Message.ToString());
                    }
                    catch (IOException ioEx)
                    {
                        Console.WriteLine(ioEx.Message);
                    }

                    index++;
                }

                Console.WriteLine("*****************************************");
                Console.WriteLine("下載" + downloaded + "張ship_id為" + ship_id + "的圖片");
                Console.WriteLine("*****************************************");
            }
        }

        /// <summary>
        /// Entry point: collects ship ids from the index, downloads the photos of every
        /// collected ship, then waits for Enter so the console window stays open.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            List<string> shipid_list = new List<string>();
            //shipid_list.Add("371681"); // quick single-ship run for testing

            download_all_shipid(shipid_list);

            // To resume from a previously saved list instead of crawling the index again,
            // replace the call above with:
            //using (FileStream fsReader = new FileStream(ShipIdListFile, FileMode.Open, FileAccess.Read))
            //{
            //    BinaryFormatter bf = new BinaryFormatter();
            //    shipid_list = (List<string>)bf.Deserialize(fsReader);
            //    Console.WriteLine("成功載入" + shipid_list.Count + "個shipid");
            //}
            // and drop the ships that were already downloaded, e.g.:
            //shipid_list.Remove("371652");
            //shipid_list.Remove("371668");
            //shipid_list.Remove("371681");
            //shipid_list.Remove("1252401");
            //shipid_list.Remove("371077");
            //shipid_list.Remove("132264");
            //shipid_list.Remove("224871");
            //shipid_list.Remove("279923");
            //shipid_list.Remove("369163");
            //shipid_list.Remove("266342");
            //shipid_list.Remove("371216");
            //shipid_list.Remove("368174");

            foreach (var ship_id in shipid_list)
            {
                download_jpg(ship_id);
            }

            Console.ReadLine(); // keep the console window open
        }
    }
}

用 C# 編寫爬蟲在 marinetraffic 下載船隻圖片

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More