HtmlAgilityPack C#爬蟲

來源:互聯網
上載者:User

標籤:end   name   eval   程式   res   bst   tput   ace   div   

Main程式
  /// <summary>
  /// Console entry point: downloads each book page in the hard-coded id range,
  /// parses it with <see cref="BookDataParser"/> and saves the linked file as
  /// "&lt;Title&gt;.txt" in the working directory.
  /// </summary>
  class Program
  {
      /// <summary>
      /// Runs the crawl loop, printing each parsed item (or a failure line) to the console,
      /// then waits for a key press before exiting.
      /// </summary>
      /// <param name="args">Command-line arguments (unused).</param>
      static void Main(string[] args)
      {
          // {0} is replaced with the numeric book id.
          var tmpStr1 = "http://www.****.com/txtxz/{0}/down.html";

          // WebClient is IDisposable; the using block releases it once the loop is done.
          using (WebClient wc = new WebClient())
          {
              for (int i = 54422; i < 54423; i++)
              {
                  var str = String.Format(tmpStr1, i);
                  try
                  {
                      String result = wc.DownloadString(str);
                      BookDataParser parser = new BookDataParser(result, str);
                      var item = parser.GetBookData();

                      // The parser may return null, and DownLink may be missing;
                      // neither case can be downloaded, so report and move on.
                      if (item == null || String.IsNullOrEmpty(item.DownLink))
                      {
                          Console.WriteLine("fail to get in url{0}", i);
                          continue;
                      }

                      wc.DownloadFile(item.DownLink, item.Title + ".txt");
                      Console.WriteLine(item);
                  }
                  catch (WebException ex)
                  {
                      // One failed request should not abort the remaining ids.
                      Console.WriteLine("fail to get in url{0}: {1}", i, ex.Message);
                  }
              }
          }

          Console.ReadKey();
      }
  }
BookDataParser.cs
/// <summary>
/// Parses the HTML of a book page into a <see cref="BookData"/> instance.
/// </summary>
public class BookDataParser
{
    private readonly String _content;    // raw HTML content
    private readonly HtmlDocument _doc;  // HtmlAgilityPack document built from _content
    private readonly String _url;        // current page link, only used to fill BookData.InfoLink

    /// <summary>
    /// Creates a parser for the given HTML.
    /// </summary>
    /// <param name="content">HTML content; may be null, in which case <see cref="VaildContent"/> returns false.</param>
    /// <param name="url">Link of the current page.</param>
    public BookDataParser(string content, string url)
    {
        _url = url;
        _content = content;
        _doc = new HtmlDocument();

        // Skip loading for null content so construction succeeds and
        // VaildContent can report the content as invalid.
        if (content != null)
        {
            _doc.LoadHtml(content);
        }
    }

    /// <summary>
    /// Parses the loaded document and returns the extracted book data.
    /// </summary>
    /// <returns>
    /// The populated <see cref="BookData"/>, or null when the content is invalid
    /// or one of the required nodes (title, info spans, download link) is missing.
    /// </returns>
    /// <exception cref="FormatException">The upload-time text cannot be parsed by <see cref="DateTime.Parse(string)"/>.</exception>
    public BookData GetBookData()
    {
        if (!VaildContent())
        {
            return null;
        }

        var root = _doc.DocumentNode;
        var titleNode = root.SelectSingleNode("//div[@id='titlename']/h1");
        var infoNode = root.SelectNodes("//div[@class='txt_info']/span");
        var descNode = root.SelectSingleNode("//div[@class='infos_txt']");
        var linkNode1 = root.SelectSingleNode("//div[@class='pan_url']/a[last()]");

        // Author is read from span 0 and class from span 1, so at least two spans are needed.
        if (titleNode == null || infoNode == null || infoNode.Count < 2 || linkNode1 == null)
        {
            return null;
        }

        var titleText = titleNode.ChildNodes.FirstOrDefault(n => n.Name == "#text");
        if (titleText == null)
        {
            return null;
        }

        var bookData = new BookData();

        // Title: the text before the "TXT" marker; if the marker is absent,
        // fall back to the whole text instead of throwing from Substring.
        var str = titleText.InnerHtml;
        var markerIndex = str.IndexOf("TXT", StringComparison.Ordinal);
        bookData.Title = markerIndex >= 0 ? str.Substring(0, markerIndex) : str.Trim();

        // Author
        bookData.AuthorName = AfterColon(infoNode[0].InnerText);

        // Class
        bookData.Class = AfterColon(infoNode[1].InnerText);

        // Upload date (taken from the last span)
        bookData.UploadTime = DateTime.Parse(AfterColon(infoNode.Last().InnerText));

        // Description
        bookData.Description = GetDesc(descNode);

        // DownLink
        bookData.DownLink = linkNode1.GetAttributeValue("href", null);

        // InfoLink
        bookData.InfoLink = _url;

        return bookData;
    }

    /// <summary>
    /// Returns the part of <paramref name="text"/> after its first ':'.
    /// When there is no ':', the whole text is returned.
    /// </summary>
    private static string AfterColon(string text)
    {
        // IndexOf returns -1 when ':' is absent, so the start index becomes 0.
        return text.Substring(text.IndexOf(':') + 1);
    }

    /// <summary>
    /// Builds the description text from the children of <paramref name="descNode"/>.
    /// </summary>
    /// <param name="descNode">Node holding the description; may be null.</param>
    /// <returns>
    /// The HTML-decoded, trimmed text nodes concatenated together, with a "\n"
    /// for every br element; an empty string when <paramref name="descNode"/> is null.
    /// </returns>
    private string GetDesc(HtmlNode descNode)
    {
        if (descNode == null)
        {
            return String.Empty;
        }

        StringBuilder sb = new StringBuilder();
        foreach (var node in descNode.ChildNodes)
        {
            if (node.Name == "#text")
            {
                var str = node.InnerText;
                if (!String.IsNullOrWhiteSpace(str))
                {
                    sb.Append(HttpUtility.HtmlDecode(str).Trim());
                }
            }
            else if (node.Name == "br")
            {
                sb.Append("\n");
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Checks whether the content is usable for parsing.
    /// </summary>
    /// <returns>
    /// False when the content is null, or when the page has a
    /// div[@class='blocktitle'] whose text starts with the error marker;
    /// true otherwise.
    /// </returns>
    public bool VaildContent()
    {
        if (_content == null)
        {
            return false;
        }

        // '@class' selects the attribute; without '@' the predicate would test
        // a child element named "class" instead.
        var node = _doc.DocumentNode.SelectSingleNode("//div[@class='blocktitle']");

        // Valid unless the block title announces an error.
        return node == null || !node.InnerText.StartsWith("出現錯誤", StringComparison.Ordinal);
    }
}

 

HtmlAgilityPack C#爬蟲

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support, 6 Free Tickets per Quarter, Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.