C#網路爬蟲與搜尋引擎調研的代碼詳情介紹

來源:互聯網
上載者:User
效果頁面:

大致思路:

一個入口連結,例如:www.sina.com.cn,從它入手開始爬,找到了連結,(在此可以解析出網頁內容,輸入一個關鍵字,判斷是否包含輸入的關鍵字,包含就把這個連結以及網頁相關內容放入緩衝),把爬到的串連放入緩衝,遞迴執行。

做的比較簡陋,算是自己總結一下。

同時啟動10個線程,每個線程對應各自的串連池緩衝,把包含關鍵字的串連都放入同一個緩衝裡面,準備一個service頁面,定時重新整理,顯示當前的結果(僅僅是類比,真正的搜尋引擎一定是先用分詞法對關鍵字進行解析,然後結合網頁內容把合格網頁和串連存到檔案裡面,下次搜尋的時候一定是從檔案裡面找結果,它們的爬蟲24小時爬)。下面看一下具體實現。

實體類:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Threading;

namespace SpiderDemo.Entity
{
    /// <summary>
    /// A crawler worker: one OS thread plus the private pool of links it still has to visit.
    /// </summary>
    public class ClamThread
    {
        public Thread _thread { get; set; }
        public List<Link> lnkPool { get; set; }
    }

    /// <summary>
    /// A crawled hyperlink, its rendered anchor HTML, and the keyword context
    /// extracted from the page it appeared on.
    /// </summary>
    public class Link
    {
        public string Href { get; set; }
        public string LinkName { get; set; }
        public string Context { get; set; }
        // NOTE(review): original "TheadId" spelling (sic) kept so existing callers still compile.
        public int TheadId { get; set; }

        // BUG FIX: List<Link>.Contains previously used reference equality, so the
        // duplicate-link filter in the parser never matched anything. Two links are
        // the same when they point at the same Href.
        public override bool Equals(object obj)
        {
            Link other = obj as Link;
            return other != null && string.Equals(Href, other.Href, StringComparison.Ordinal);
        }

        public override int GetHashCode()
        {
            return Href != null ? Href.GetHashCode() : 0;
        }
    }
}

緩衝類:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using SpiderDemo.Entity;
using System.Threading;

namespace SpiderDemo.SearchUtil
{
    /// <summary>
    /// Process-wide crawler state shared by the UI page, the worker threads,
    /// and the status handler.
    /// </summary>
    public static class CacheHelper
    {
        // BUG FIX: this stop flag is written by the UI thread and polled by the
        // worker threads; volatile guarantees the workers see the update.
        public static volatile bool EnableSearch;

        /// <summary>Seed URL the crawl starts from.</summary>
        public const string StartUrl = "http://www.sina.com.cn";

        /// <summary>
        /// Per-thread cap on cached links; with better resource reclamation the
        /// crawl could run unbounded.
        /// </summary>
        public const int MaxNum = 300;

        /// <summary>Stop after this many matching pages.</summary>
        public const int MaxResult = 1000;

        /// <summary>Matching pages found so far (update via Interlocked -- many threads write it).</summary>
        public static int SpideNum;

        /// <summary>Keyword being searched for.</summary>
        public static string KeyWord;

        /// <summary>Elapsed crawl time (not currently maintained).</summary>
        public static int RuningTime;

        /// <summary>Maximum allowed crawl time (not currently enforced).</summary>
        public static int MaxRuningtime;

        /// <summary>The ten crawler threads working in parallel.</summary>
        public static ClamThread[] ThreadList = new ClamThread[10];

        /// <summary>Links harvested from the seed page; dealt out to the workers.</summary>
        public static List<Link> LnkPool = new List<Link>();

        /// <summary>Links whose page text contains the keyword (shared result set).</summary>
        public static List<Link> validLnk = new List<Link>();

        /// <summary>Guards concurrent access to the shared lists above.</summary>
        public static readonly object syncObj = new object();
    }
}

HTTP請求類:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;

namespace SpiderDemo.SearchUtil
{
    /// <summary>
    /// Minimal HTTP helper for the crawler.
    /// </summary>
    public static class HttpPostUtility
    {
        /// <summary>
        /// Issues a synchronous GET and returns the raw response stream, or null
        /// when the URL is empty or the request fails (crawling is best-effort:
        /// many sites refuse or throttle us). Caller owns the returned stream.
        /// TODO: make this asynchronous in a later pass.
        /// </summary>
        /// <param name="url">Absolute URL to fetch; null/empty yields null.</param>
        /// <returns>The response body stream, or null on any failure.</returns>
        public static Stream SendReq(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return null;
            }

            try
            {
                // Proxy support kept for reference; enable when running behind one:
                // WebProxy wp = new WebProxy("10.0.1.33:8080");
                // wp.Credentials = new NetworkCredential("user", "pass", "domain");
                HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
                // myRequest.Proxy = wp;

                // BUG FIX: without a timeout a dead host pins a crawler thread forever.
                myRequest.Timeout = 15000;

                HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse();
                return myResponse.GetResponseStream();
            }
            catch (Exception)
            {
                // Best-effort: some sites reject the request (403, robots, etc.);
                // treat any failure as "no content".
                return null;
            }
        }
    }
}

解析網頁類,這裡用到了一個組件,HtmlAgilityPack.dll,很好用,下載串連:https://html-agility-pack.net/

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Threading;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using HtmlAgilityPack;
using System.IO;
using SpiderDemo.Entity;

namespace SpiderDemo.SearchUtil
{
    /// <summary>
    /// Parses a downloaded page: records the link (with highlighted context)
    /// when the page text contains the keyword, and harvests the page's anchors
    /// into the calling thread's link pool.
    /// </summary>
    public static class UrlAnalysisProcessor
    {
        /// <param name="url">The link whose page was downloaded; receives the keyword context.</param>
        /// <param name="s">Raw HTML stream (null when the download failed).</param>
        /// <param name="lnkPool">This worker's pool that new anchors are appended to.</param>
        public static void GetHrefs(Link url, Stream s, List<Link> lnkPool)
        {
            try
            {
                // No HTML stream (request failed) -- nothing to do.
                if (s == null)
                {
                    return;
                }

                // Per-thread pool is full; the consumer drains too slowly, stop caching.
                if (lnkPool.Count >= CacheHelper.MaxNum)
                {
                    return;
                }

                HtmlAgilityPack.HtmlDocument doc = new HtmlDocument();

                // NOTE(review): Encoding.Default is the ANSI codepage, NOT UTF-8 as the
                // original comment claimed; UTF-8 pages may still render mojibake.
                // Kept as-is to preserve behavior -- consider charset sniffing.
                doc.Load(s, Encoding.Default);

                // All anchors carrying an href. SelectNodes returns null (not an
                // empty list) when nothing matches.
                IEnumerable<HtmlNode> nodeList = doc.DocumentNode.SelectNodes("//a[@href]");

                // Strip <script> and <style> so InnerText is only visible page text.
                foreach (var script in doc.DocumentNode.Descendants("script").ToArray())
                    script.Remove();
                foreach (var style in doc.DocumentNode.Descendants("style").ToArray())
                    style.Remove();

                string allText = doc.DocumentNode.InnerText;
                int index = 0;

                // Page matches when it contains the keyword.
                if ((index = allText.IndexOf(CacheHelper.KeyWord)) != -1)
                {
                    // Capture ~40 chars of context around the hit, keyword highlighted.
                    if (index > 20 && index < allText.Length - 20 - CacheHelper.KeyWord.Length)
                    {
                        // BUG FIX: Substring's second argument is a LENGTH; the original
                        // passed `index`, grabbing everything up to the hit instead of 20 chars.
                        string keyText = allText.Substring(index - 20, 20) +
                            "<span style='color:green'>" + allText.Substring(index, CacheHelper.KeyWord.Length) + "</span> " +
                            allText.Substring(index + CacheHelper.KeyWord.Length, 20) + "<br />";
                        url.Context = keyText;
                    }

                    // BUG FIX: validLnk and the hit counter are shared by all ten
                    // crawler threads; synchronize the mutation.
                    lock (CacheHelper.syncObj)
                    {
                        CacheHelper.validLnk.Add(url);
                    }
                    Interlocked.Increment(ref CacheHelper.SpideNum);
                }

                // BUG FIX: a page with no anchors made the old foreach throw a
                // NullReferenceException that vanished into the blanket catch.
                if (nodeList == null)
                {
                    return;
                }

                foreach (HtmlNode node in nodeList)
                {
                    if (node.Attributes["href"] == null)
                    {
                        continue;
                    }

                    Link lk = new Link()
                    {
                        Href = node.Attributes["href"].Value,
                        // BUG FIX: target='_blank' -- the original 'blank' reuses a
                        // single named window instead of opening a new tab.
                        LinkName = "<a href='" + node.Attributes["href"].Value +
                            "' target='_blank' >" + node.InnerText + "  " +
                            node.Attributes["href"].Value + "</a>" + "<br />"
                    };

                    // Skip javascript: pseudo-links, in-page anchors, and duplicates.
                    if (lk.Href.StartsWith("javascript") || lk.Href.StartsWith("#") || lnkPool.Contains(lk))
                    {
                        continue;
                    }

                    lnkPool.Add(lk);
                }
            }
            catch (Exception)
            {
                // Best-effort parsing: malformed pages are simply skipped.
            }
        }
    }
}

搜尋網頁面CODE BEHIND:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using SpiderDemo.SearchUtil;
using System.Threading;
using System.IO;
using SpiderDemo.Entity;

namespace SpiderDemo
{
    /// <summary>
    /// Search page: seeds the crawl from the start URL, deals the harvested
    /// links out to ten worker threads, and lets the user stop the crawl.
    /// </summary>
    public partial class SearchPage : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {
            if (!IsPostBack)
            {
                InitSetting();
            }
        }

        private void InitSetting()
        {
        }

        /// <summary>
        /// Fetches the seed page, splits its links evenly across the worker
        /// threads, then starts the workers (staggered by 100 ms).
        /// </summary>
        private void StartWork()
        {
            CacheHelper.EnableSearch = true;
            CacheHelper.KeyWord = txtKeyword.Text;

            // First request goes to the seed site; returns the raw HTML stream.
            Stream htmlStream = HttpPostUtility.SendReq(CacheHelper.StartUrl);

            Link startLnk = new Link()
            {
                Href = CacheHelper.StartUrl,
                LinkName = "<a href ='" + CacheHelper.StartUrl + "' > 新浪 " + CacheHelper.StartUrl + " </a>"
            };

            // Parse the seed page's anchors into the shared pool.
            UrlAnalysisProcessor.GetHrefs(startLnk, htmlStream, CacheHelper.LnkPool);

            for (int i = 0; i < CacheHelper.ThreadList.Length; i++)
            {
                CacheHelper.ThreadList[i] = new ClamThread();
                CacheHelper.ThreadList[i].lnkPool = new List<Link>();
            }

            // Deal the harvested links round-robin to the workers.
            for (int i = 0; i < CacheHelper.LnkPool.Count; i++)
            {
                int tIndex = i % CacheHelper.ThreadList.Length;
                CacheHelper.ThreadList[tIndex].lnkPool.Add(CacheHelper.LnkPool[i]);
            }

            Action<ClamThread> clamIt = new Action<ClamThread>((clt) =>
            {
                // BUG FIX: a worker may have been dealt no links at all; the
                // original indexed [0] unconditionally and threw.
                if (clt.lnkPool.Count == 0)
                {
                    return;
                }
                Stream s = HttpPostUtility.SendReq(clt.lnkPool[0].Href);
                DoIt(clt, s, clt.lnkPool[0]);
            });

            for (int i = 0; i < CacheHelper.ThreadList.Length; i++)
            {
                // BUG FIX: capture a per-iteration copy. The original lambda closed
                // over the for-loop variable `i`, so late-starting threads could all
                // observe its final value (classic closure-capture bug).
                ClamThread worker = CacheHelper.ThreadList[i];
                worker._thread = new Thread(new ThreadStart(() =>
                {
                    clamIt(worker);
                }));

                // Stagger thread start-up by 100 ms.
                worker._thread.Start();
                Thread.Sleep(100);
            }
        }

        /// <summary>
        /// Worker loop: parse the current page, then keep taking the first link
        /// from this thread's pool until it is empty or the crawl is stopped.
        /// (Rewritten from unbounded recursion -- one stack frame per crawled
        /// page risked a StackOverflowException -- to an iterative loop.)
        /// </summary>
        private void DoIt(ClamThread thread, Stream htmlStream, Link url)
        {
            while (true)
            {
                if (!CacheHelper.EnableSearch)
                {
                    return;
                }
                if (CacheHelper.SpideNum > CacheHelper.MaxResult)
                {
                    return;
                }

                // Parse: record the page if it matches, harvest its links into the pool.
                UrlAnalysisProcessor.GetHrefs(url, htmlStream, thread.lnkPool);

                // Pool drained -- this worker finishes naturally (no Thread.Abort needed).
                if (thread.lnkPool.Count == 0)
                {
                    return;
                }

                // Take the head link and remove it so it is not crawled twice.
                Link firstLnk = thread.lnkPool[0];
                thread.lnkPool.Remove(firstLnk);

                firstLnk.TheadId = Thread.CurrentThread.ManagedThreadId;
                htmlStream = HttpPostUtility.SendReq(firstLnk.Href);
                url = firstLnk;
            }
        }

        protected void btnSearch_Click(object sender, EventArgs e)
        {
            this.StartWork();
        }

        protected void btnShow_Click(object sender, EventArgs e)
        {
        }

        protected void btnStop_Click(object sender, EventArgs e)
        {
            // Signal the workers to exit their loop; Thread.Abort is kept only as
            // a last resort for threads blocked in I/O.
            CacheHelper.EnableSearch = false;

            foreach (var t in CacheHelper.ThreadList)
            {
                // BUG FIX: entries are null until a search has been started, and the
                // original dereferenced them unconditionally.
                if (t != null && t._thread != null && t._thread.IsAlive)
                {
                    t._thread.Abort();
                }
            }
            //CacheHelper.ValidLnk.Clear();
            CacheHelper.LnkPool.Clear();
            CacheHelper.validLnk.Clear();
        }
    }
}

搜尋網頁面前台代碼:

<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="SearchPage.aspx.cs" Inherits="SpiderDemo.SearchPage" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
    <p>
        關鍵字:<asp:TextBox runat="server" ID="txtKeyword"></asp:TextBox>
        <asp:Button runat="server" ID="btnSearch" Text="搜尋" onclick="btnSearch_Click" />
        <asp:Button runat="server" ID="btnStop" Text="停止" onclick="btnStop_Click" />
    </p>
    <p>
        <%-- 結果頁每 2 秒輪詢一次 handler,嵌入顯示 --%>
        <iframe width="800px" height="700px" src="ShowPage.aspx"></iframe>
    </p>
    </form>
</body>
</html>

ShowPage.aspx(嵌在SearchPage裡面,ajax請求一個handler):

<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="ShowPage.aspx.cs" Inherits="SpiderDemo.ShowPage" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
    <script src="js/jquery-1.6.js"></script>
</head>
<body>
    <form id="form1" runat="server">
    <p>
    </p>
    <p id="pRet">
    </p>
    <script type="text/javascript">
        $(document).ready(
            function () {
                // Poll the status handler every 2 s and render its HTML fragment.
                var timer = setInterval(
                    function () {
                        $.ajax({
                            type: "POST",
                            url: "http://localhost:26820/StateServicePage.ashx",
                            data: "op=info",
                            success: function (msg) {
                                $("#pRet").html(msg);
                            }
                        });
                    }, 2000);
            });
    </script>
    </form>
</body>
</html>

StateServicePage.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using SpiderDemo.SearchUtil;
using SpiderDemo.Entity;

namespace SpiderDemo
{
    /// <summary>
    /// Status handler polled by ShowPage.aspx every 2 s; reports whether the
    /// crawl is running, how many links are pooled, and the matches found so far.
    /// </summary>
    public class StateServicePage : IHttpHandler
    {
        public void ProcessRequest(HttpContext context)
        {
            context.Response.ContentType = "text/plain";

            if (context.Request["op"] != null && context.Request["op"] == "info")
            {
                context.Response.Write(ShowState());
            }
        }

        /// <summary>Builds the HTML status fragment: running flag, pooled-link count, results.</summary>
        public string ShowState()
        {
            StringBuilder sbRet = new StringBuilder(100);
            string ret = GetValidLnkStr();

            int count = 0;
            for (int i = 0; i < CacheHelper.ThreadList.Length; i++)
            {
                // Entries are null until a crawl has been started.
                if (CacheHelper.ThreadList[i] != null && CacheHelper.ThreadList[i].lnkPool != null)
                    count += CacheHelper.ThreadList[i].lnkPool.Count;
            }

            sbRet.AppendLine("服務是否運行 : " + CacheHelper.EnableSearch + "<br />");
            sbRet.AppendLine("串連池總數: " + count + "<br />");
            sbRet.AppendLine("搜尋結果:<br /> " + ret);

            return sbRet.ToString();
        }

        /// <summary>Snapshots the shared result list and renders each hit as HTML.</summary>
        private string GetValidLnkStr()
        {
            StringBuilder sb = new StringBuilder(120);

            // BUG FIX: the old size-then-CopyTo pattern raced with crawler threads
            // appending to validLnk (the list could grow between the Count read and
            // the copy); take the snapshot under the shared lock instead.
            Link[] cloneLnk;
            lock (CacheHelper.syncObj)
            {
                cloneLnk = CacheHelper.validLnk.ToArray();
            }

            for (int i = 0; i < cloneLnk.Length; i++)
            {
                sb.AppendLine("<br/>" + cloneLnk[i].LinkName + "<br />" + cloneLnk[i].Context);
            }

            return sb.ToString();
        }

        public bool IsReusable
        {
            get
            {
                return false;
            }
        }
    }
}

以上就是C#網路爬蟲與搜尋引擎調研的代碼詳情介紹的內容,更多相關內容請關注topic.alibabacloud.com(www.php.cn)!

  • 相關文章

    聯繫我們

    該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

    如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.