蜘蛛程式(廣度優先,c#多線程版本)

來源:互聯網
上載者:User

多線程這裡,我主要是使用全域變數來控制當前線程數量:每個線程一啟動就進行原子操作,增加當前活動線程數量;線程結束時再進行原子操作,減少當前活動線程數量。當隊列為空並且活動線程數為0時,認為任務完成,退出迴圈。如果隊列為空但是仍有活動線程,則主線程休眠,然後再次判斷條件。如果隊列不為空(無論此時是否有活動線程),則判斷線程數量並決定是否開啟新線程抓取。

 

 

/*

* XssScan.cs,雲舒,070704下午

*/

using System;

using System.Threading;

using System.Collections;

using System.Collections.Generic;

using System.Text;

using Winista.Text.HtmlParser;

using Winista.Text.HtmlParser.Data;

namespace Ph4nt0m.XssScan

{

    public class XssScan

    {

        public static string        domain;

        public static Hashtable        url_hash;

        public static string[]        excempt_file;

        public static Int32            max_thread = 10;

        public static Int32            current_thread = 0;

        public static Int32            time_out;

        public static Queue            pre_url;

    

        public static int Main(string[] args)

        {

            string base_url;

            // 檢查參數個數

            if ( args.Length != 2 )

            {

                Console.WriteLine( "Usage: XssScan.exe   <url>   <sleep>" );

                return -1;

            }

            base_url = args[0];

            time_out = Int32.Parse(args[1]);

            url_hash = new Hashtable();

            pre_url = new Queue(100);

            // 不進行分析的檔案

            excempt_file = new string[] { ".exe", ".rar", ".zip", ".tar", ".gz",

                                            ".pdf", ".swf", ".jpg", ".png", ".gif", ".bmp",

                                            ".mp3", ".mp4", ".rm", ".rmvb", ".smil", ".wma",

                                            ".pl", ".c", ".cpp"

                                        };

            // 處理url,方便下面截取基地址

            if (base_url.StartsWith("http://"))/

            {

                base_url = base_url.Substring(7);

            }

            if (base_url.EndsWith("/") )

            {

                base_url = base_url.TrimEnd('/');

            }

            else if (base_url.EndsWith("\"))

            {

                base_url = base_url.TrimEnd('\');

            }

            // 按照/字元分割url,擷取網域名稱.

            // 網域名稱取得較長,若包含目錄,則可以防止抓取到上層目錄,此處包含了純網域名稱後面的目錄

            if (base_url.IndexOf('/') == -1)

            {

                domain = base_url;

            }

            else

            {

                string[] domains = base_url.Split('/');

                // 是檔案還是目錄

                if (domains[domains.Length - 1].IndexOf('.') != -1)

                {

                    Int32 pos = base_url.LastIndexOf('/');

                    domain = base_url.Substring(0, pos);

                }

                else

                {

                    domain = base_url;

                }

            }

            base_url = "http://"/ + base_url;

            domain = "http://"/ + domain;

            //Console.WriteLine("base_url: ", base_url);

            //Console.WriteLine( "domain: ", domain);

            // 將基URL加入到隊列並開始抓取

            //pre_url.Enqueue(base_url);

            Ph4nt0m.XssScan.Parser parser_base = new Ph4nt0m.XssScan.Parser(base_url);

            parser_base.GetLinksFromUrl();

            while ( true )

            {

                // 沒有活動線程且隊列為空白,則說明抓取完成

                if ( current_thread == 0 && pre_url.Count == 0 )

                {

                    break;

                }

                // 隊列為空白但是有活動線程則主線程休眠,然後再次判斷條件

                if (pre_url.Count == 0)

                {

                    Thread.Sleep(100);

                    continue;

                }

                // 隊列線程均不為空白,或隊列不空線程為空白,判斷線程數量並決定是否開啟新線程抓取

                if (current_thread < max_thread)

                {

                    string current_url = (string)pre_url.Dequeue();

                    Ph4nt0m.XssScan.Parser parser = new Ph4nt0m.XssScan.Parser(current_url);

                    Thread work_thread = new Thread(new ThreadStart(parser.GetLinksFromUrl));

                    work_thread.Start();

                }

                else

                {

                    Console.WriteLine("休眠主線程,當前線程數量為: ", current_thread);

                }

                Thread.Sleep(time_out);

            }

            Console.WriteLine("All done.\nThere are links:", url_hash.Count);

            foreach (string key in url_hash.Keys)

            {

                //Console.WriteLine(key);

            }

            return 0;

        }

    }

}

執行工作的類。C#比較變態的是 ThreadStart 委派不能給線程啟動的函數傳遞參數,所以無奈之下我使用建構函式來設定變數,也就是需要抓取的URL地址:

代碼:

 

/*

* Parser.cs,雲舒,070704下午

*/

using System;

using System.Collections;

using System.Collections.Generic;

using System.Text;

using System.Threading;

using Winista.Text.HtmlParser;

using Winista.Text.HtmlParser.Data;

namespace Ph4nt0m.XssScan

{

    public class Parser

    {

        private string _url;

        public Parser(string url)

        {

            this._url = url;

        }

        /// <summary>

        /// 從給定的連結中擷取網頁內容,並擷取該網頁內的所有站內連結

        /// </summary>

        /// <param name="url">url地址,string</param>

        public void GetLinksFromUrl( )

        {

            // 遞增當前線程數量

            Interlocked.Increment(ref XssScan.current_thread);

            Winista.Text.HtmlParser.Parser parser;

            PageData page_data;

            Hashtable tmp_hash = new Hashtable();

            string url = CheckUrl(_url);

            parser = new Winista.Text.HtmlParser.Parser(new Uri(url));

            // 解析html

            page_data = parser.GetAllOutLinks(1, true);

        

            Int32 excempt_found = 0;

            foreach (LinkData link_data in page_data.OutLinks)

            {

                //Console.WriteLine( "[DEBUG1]: ", link_data.Url );

                // 跳過二進位檔案

                foreach (string ext_name in XssScan.excempt_file)

                {

                    if (link_data.Url.EndsWith(ext_name))

                    {

                        //Console.WriteLine("[DEBUG2]: igonre ", link_data.Url, ext_name);

                        excempt_found = 1;

                        break;

                    }

                }

                

                if( excempt_found == 1 )

                {

                    // 重設標記

                    excempt_found = 0;

                    continue;

                }

                //Console.WriteLine("[DEBUG3]: ", link_data.Url);

                // 是否抓出邊界,這裡先判斷邊界,儘可能少的遍曆hash

                // 仔細權衡,感覺檢查邊界放在這裡更節省資源

                if (link_data.Url.StartsWith(XssScan.domain))

                {

                    // URL記錄中是否已經包含了此URL

                    if (!XssScan.url_hash.ContainsKey(link_data.Url))

                    {

                        Console.WriteLine("Url: ", link_data.Url);

                        try

                        {

                            // 加鎖,確保只有一個線程能寫URL HASH和隊列

                            Monitor.Enter(this);

                            // 將URL加入到全域的URL列表中

                            XssScan.url_hash[link_data.Url] = "true";

                            // 將URL加入到確保同時只有一個線程更新URL隊列

                            XssScan.pre_url.Enqueue(link_data.Url);

                        }

                        finally

                        {

                            // 釋放鎖

                            Monitor.Exit(this);

                        }

                    }

                }

            }

            // 減少當前線程數量

            Interlocked.Decrement(ref XssScan.current_thread);

        }

        private static string CheckUrl(string url)

        {

            // 在url後面加上/字元,這個似乎是winista.HtmlParser的bug,如果url包含目錄而沒有以/結尾,它會

            // 認為這個是檔案。

            if (!url.EndsWith("/"))

            {

                string[] tokens = url.Split('/');

                Int32 count = tokens.Length;

                // .ddd和ddd.均當作目錄處理

                if (tokens[count - 1].StartsWith(".") || tokens[count - 1].EndsWith("."))

                {

                    url += '/';

                }

                // 如果中間不包含"."也當成目錄,加上"/"字元

                else if (tokens[count - 1].IndexOf('.') == -1)

                {

                    url += '/';

                }

            }

            //Console.WriteLine(url);

            return url;

        }

    }

}

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.