使用Xpath從網頁中擷取資料

來源:互聯網
上載者:User

標籤:

 /// <summary>
        /// Downloads the product listing page of grandcanyononepoint.com and, for every listed
        /// product for which <c>ProductMessage.GetProductList(code, "")</c> returns at least one
        /// entry, downloads the product's detail page and passes the scraped description,
        /// departure point, highlights, equipment and image URLs to the <c>ProductMessage</c>
        /// persistence methods.
        /// </summary>
        /// <returns>
        /// The products that were matched and processed. The list is empty (never null) when the
        /// listing page contains no <c>class="list-product"</c> elements.
        /// </returns>
        /// <remarks>
        /// Persistence calls are best-effort: a failure is written to
        /// <see cref="System.Diagnostics.Trace"/> and processing continues with the next item.
        /// Listing entries or detail pages that lack the expected markup are skipped with a
        /// trace warning instead of aborting the whole run.
        /// </remarks>
        public List<ProductMessage> GetlistProductMessage()
        {
            List<ProductMessage> products = new List<ProductMessage>();

            string listingHtml = GetProductsDescriptionsImage("http://www.grandcanyononepoint.com/products");
            HtmlNode listingRoot = ParseHtmlFragmentRoot(listingHtml);

            // "//*[@class='list-product']" selects every element whose class attribute is exactly
            // "list-product". XPath string literals must use straight quotes.
            // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection listingNodes = listingRoot.SelectNodes("//*[@class='list-product']");
            if (listingNodes == null)
            {
                return products;
            }

            foreach (HtmlNode listingNode in listingNodes)
            {
                // Re-parse the entry on its own so the "//" queries below only see this product
                // and not the whole listing page.
                HtmlNode entryRoot = ParseHtmlFragmentRoot(listingNode.InnerHtml);

                HtmlNode codeNode = entryRoot.SelectSingleNode("//*[@style='float:right;']");
                if (codeNode == null)
                {
                    System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: listing entry without a product code element was skipped.");
                    continue;
                }

                ProductMessage db_product = new ProductMessage();
                db_product.Code = Formsub(codeNode.InnerText);

                // Only products that GetProductList can find by code are processed further.
                List<ProductMessage> knownProducts = ProductMessage.GetProductList(db_product.Code, "");
                if (knownProducts == null || knownProducts.Count == 0)
                {
                    continue;
                }

                HtmlNode nameNode = entryRoot.SelectSingleNode("//*[@style='float:left;']");
                HtmlNode linkNode = entryRoot.SelectSingleNode("a");
                HtmlAttribute hrefAttribute = linkNode == null ? null : linkNode.Attributes["href"];
                if (nameNode == null || hrefAttribute == null)
                {
                    System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: listing entry for code '{0}' has no name or link and was skipped.", db_product.Code);
                    continue;
                }

                db_product.Name = Formsub(nameNode.InnerText);
                // The product ID is derived from the href of the entry's top-level <a> element.
                db_product.ID = GetProductID(hrefAttribute.Value);

                string detailHtml = GetProductsDescriptionsImage("http://www.grandcanyononepoint.com/products/view/" + db_product.ID);
                HtmlNode detailRoot = ParseHtmlFragmentRoot(detailHtml);

                HtmlNode descriptionNode = detailRoot.SelectSingleNode("//*[@class='product-desc']");
                if (descriptionNode == null)
                {
                    System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: detail page for product '{0}' has no description and was skipped.", db_product.Code);
                    continue;
                }

                // NOTE(review): apostrophes are stripped before the text reaches the data layer,
                // presumably because UpdateWEBProductMessage builds SQL by concatenation. Confirm,
                // and prefer parameterized queries there.
                db_product.Descmation = Formsub(descriptionNode.InnerHtml).Replace("'", "");

                HtmlNode departingNode = detailRoot.SelectSingleNode("//*[@class='details-tile']");
                if (departingNode != null)
                {
                    db_product.DepartingFrom = Formsub(departingNode.InnerHtml.Replace("Departing From", ""));
                }

                HtmlNode highlightsNode = detailRoot.SelectSingleNode("//*[@class='details-tile details-list']");
                if (highlightsNode != null)
                {
                    db_product.ProductHighlights = Formsub(highlightsNode.InnerHtml.Replace("Product Highlights", "")).Replace("'", "");
                }

                try
                {
                    ProductMessage.UpdateWEBProductMessage(db_product.Descmation, db_product.DepartingFrom, db_product.ProductHighlights, db_product.Name, db_product.Code);
                }
                catch (System.Exception ex)
                {
                    // Best-effort: keep scraping, but leave a trace instead of hiding the failure.
                    System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: updating product '{0}' failed: {1}", db_product.Code, ex);
                }

                SaveScrapedProductEquipment(db_product, detailRoot);
                SaveScrapedProductImages(db_product, detailRoot);

                products.Add(db_product);
            }

            return products;
        }

        /// <summary>Parses an HTML string (null is treated as empty) and returns its document node.</summary>
        private static HtmlNode ParseHtmlFragmentRoot(string html)
        {
            HtmlDocument document = new HtmlDocument();
            // LoadHtml rejects null, so a failed download is parsed as an empty document.
            document.LoadHtml(html ?? string.Empty);
            return document.DocumentNode;
        }

        /// <summary>
        /// Reads the titled &lt;div&gt; children of the "product-equip" element, stores each one via
        /// <c>ProductMessage.InsertProductEquipment</c> and assigns the list to <c>product.Equipment</c>.
        /// Does nothing when the detail page has no "product-equip" element.
        /// </summary>
        private static void SaveScrapedProductEquipment(ProductMessage product, HtmlNode detailRoot)
        {
            HtmlNode equipmentContainer = detailRoot.SelectSingleNode("//*[@class='product-equip']");
            if (equipmentContainer == null)
            {
                return;
            }

            List<EquipmentModel> equipmentList = new List<EquipmentModel>();

            // Re-parse the container so "div" matches only its direct children.
            HtmlNodeCollection equipmentNodes = ParseHtmlFragmentRoot(equipmentContainer.InnerHtml).SelectNodes("div");
            if (equipmentNodes != null)
            {
                foreach (HtmlNode equipmentNode in equipmentNodes)
                {
                    HtmlAttribute titleAttribute = equipmentNode.Attributes["title"];
                    if (titleAttribute == null)
                    {
                        // Without a title there is neither a name nor an image file name.
                        continue;
                    }

                    EquipmentModel equipmentModel = new EquipmentModel();
                    equipmentModel.Name = titleAttribute.Value;
                    equipmentModel.ImageUrl = "/Papillon/EquipmentImage/" + titleAttribute.Value + ".png";

                    try
                    {
                        ProductMessage.InsertProductEquipment(product.ID, equipmentModel.Name, equipmentModel.ImageUrl);
                    }
                    catch (System.Exception ex)
                    {
                        System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: storing equipment '{0}' failed: {1}", equipmentModel.Name, ex);
                    }

                    equipmentList.Add(equipmentModel);
                }
            }

            product.Equipment = equipmentList;
        }

        /// <summary>
        /// Stores the absolute URL of every image found under an element titled
        /// "See full size image" via <c>ProductMessage.InsertProductImage</c>.
        /// </summary>
        private static void SaveScrapedProductImages(ProductMessage product, HtmlNode detailRoot)
        {
            HtmlNodeCollection imageLinkNodes = detailRoot.SelectNodes("//*[@title='See full size image']");
            if (imageLinkNodes == null)
            {
                return;
            }

            foreach (HtmlNode imageLinkNode in imageLinkNodes)
            {
                // Re-parse the link so "img" matches only its direct child image.
                HtmlNode imageNode = ParseHtmlFragmentRoot(imageLinkNode.InnerHtml).SelectSingleNode("img");
                HtmlAttribute sourceAttribute = imageNode == null ? null : imageNode.Attributes["src"];
                if (sourceAttribute == null)
                {
                    continue;
                }

                // The src value is prefixed with the site root to form the original image address.
                string imageUrl = "http://www.grandcanyononepoint.com" + sourceAttribute.Value;

                try
                {
                    ProductMessage.InsertProductImage(product.ID, imageUrl);
                }
                catch (System.Exception ex)
                {
                    System.Diagnostics.Trace.TraceWarning("GetlistProductMessage: storing image '{0}' failed: {1}", imageUrl, ex);
                }
            }
        }
View Code

 

 

XPath 將 HTML 視為類似 XML 的樹狀結構來查詢,主要依據節點的不同標識(例如標籤名稱,或 class、style、title 等屬性)選取不同的內容,因此可以從網頁中擷取想要的資料;這與一般的網頁爬蟲不同。

使用Xpath從網頁中擷取資料

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.