// Root of the site that is scraped; the listing, detail and image URLs are all built from it.
private const string SiteBaseUrl = "http://www.grandcanyononepoint.com";

/// <summary>
/// Crawls the product listing of the official website and, for every listed
/// product whose code is already known locally, scrapes its detail page and
/// pushes the description, equipment and images to the local database.
/// </summary>
/// <remarks>
/// Products are matched through <c>ProductMessage.GetProductList(code, "")</c>;
/// listing entries with no match are ignored. Database writes are best-effort:
/// a failing write is traced and the crawl continues with the next item.
/// Listing entries or detail pages that lack a required element are traced
/// and skipped instead of aborting the whole crawl.
/// NOTE(review): the ProductMessage data-access calls are assumed to be static
/// members of the ProductMessage type - confirm against the class.
/// </remarks>
/// <returns>
/// The products that were found both on the website and in the local database,
/// populated with the scraped values. Never null; empty when the listing page
/// yields no product nodes.
/// </returns>
public List<ProductMessage> GetListProductMessage()
{
    List<ProductMessage> products = new List<ProductMessage>();

    HtmlNode listingRoot = ParseHtml(GetProductsDescriptionsImage(SiteBaseUrl + "/products"));
    if (listingRoot == null)
    {
        return products;
    }

    // Every element whose class attribute is exactly "list-product" is one product tile.
    // SelectNodes yields null (not an empty collection) when nothing matches.
    HtmlNodeCollection productNodes = listingRoot.SelectNodes("//*[@class='list-product']");
    if (productNodes == null)
    {
        return products;
    }

    foreach (HtmlNode productNode in productNodes)
    {
        // A leading "//" would search the whole document, so the queries below
        // are written relative to the current tile (".//" and child steps).
        HtmlNode codeNode = productNode.SelectSingleNode(".//*[@style='float:right;']");
        if (codeNode == null)
        {
            System.Diagnostics.Trace.TraceWarning("Product tile without a code element skipped.");
            continue;
        }

        ProductMessage product = new ProductMessage();
        product.Code = FormSub(codeNode.InnerText);

        // Only products that already exist locally are synchronised.
        List<ProductMessage> knownProducts = ProductMessage.GetProductList(product.Code, "");
        if (knownProducts == null || knownProducts.Count == 0)
        {
            continue;
        }

        HtmlNode nameNode = productNode.SelectSingleNode(".//*[@style='float:left;']");
        string detailHref = ReadAttribute(productNode.SelectSingleNode("a"), "href");
        if (nameNode == null || detailHref == null)
        {
            System.Diagnostics.Trace.TraceWarning(
                "Product {0}: listing tile has no name or detail link; skipped.", product.Code);
            continue;
        }

        product.Name = FormSub(nameNode.InnerText);
        // The product id is derived from the href of the tile's direct <a> child.
        product.ID = GetProductID(detailHref);

        HtmlNode detailRoot = ParseHtml(
            GetProductsDescriptionsImage(SiteBaseUrl + "/products/view/" + product.ID));
        if (detailRoot == null || !FillProductDetails(product, detailRoot))
        {
            System.Diagnostics.Trace.TraceWarning(
                "Product {0}: detail page is empty or has no description; skipped.", product.Code);
            continue;
        }

        try
        {
            ProductMessage.UpdateWebProductMessage(
                product.Descmation,
                product.DepartingFrom,
                product.ProductHighlights,
                product.Name,
                product.Code);
        }
        catch (Exception ex)
        {
            // Best-effort write: record the failure instead of hiding it.
            System.Diagnostics.Trace.TraceError(
                "Product {0}: updating the web product message failed: {1}", product.Code, ex);
        }

        List<EquipmentModel> equipment = SyncEquipment(product, detailRoot);
        if (equipment != null)
        {
            product.Equipment = equipment;
        }

        SyncImages(product, detailRoot);

        products.Add(product);
    }

    return products;
}

// Parses markup and returns the document node, or null when there is nothing to parse.
private static HtmlNode ParseHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(html);
    return document.DocumentNode;
}

// Returns the value of the named attribute, or null when the node or the attribute is missing.
private static string ReadAttribute(HtmlNode node, string attributeName)
{
    if (node == null)
    {
        return null;
    }

    HtmlAttribute attribute = node.Attributes[attributeName];
    return attribute == null ? null : attribute.Value;
}

// Copies description, departure point and highlights from the detail page onto the
// product; returns false when the mandatory description element is missing.
private bool FillProductDetails(ProductMessage product, HtmlNode detailRoot)
{
    HtmlNode descriptionNode = detailRoot.SelectSingleNode("//*[@class='product-desc']");
    if (descriptionNode == null)
    {
        return false;
    }

    // Single quotes are stripped from the stored text.
    // NOTE(review): presumably because the data layer builds SQL by concatenation -
    // confirm, and prefer parameterised queries there.
    product.Descmation = FormSub(descriptionNode.InnerHtml).Replace("'", "");

    HtmlNode departingNode = detailRoot.SelectSingleNode("//*[@class='details-tile']");
    if (departingNode != null)
    {
        product.DepartingFrom = FormSub(departingNode.InnerHtml.Replace("departing from", ""));
    }

    HtmlNode highlightsNode = detailRoot.SelectSingleNode("//*[@class='details-tile details-list']");
    if (highlightsNode != null)
    {
        product.ProductHighlights =
            FormSub(highlightsNode.InnerHtml.Replace("Product Highlights", "")).Replace("'", "");
    }

    return true;
}

// Stores every equipment entry of the detail page for the product and returns the
// entries read; returns null when the page has no equipment section at all.
private List<EquipmentModel> SyncEquipment(ProductMessage product, HtmlNode detailRoot)
{
    HtmlNode equipmentSection = detailRoot.SelectSingleNode("//*[@class='product-equip']");
    if (equipmentSection == null)
    {
        return null;
    }

    List<EquipmentModel> equipment = new List<EquipmentModel>();

    // Each direct <div> child of the section describes one piece of equipment.
    HtmlNodeCollection equipmentNodes = equipmentSection.SelectNodes("div");
    if (equipmentNodes == null)
    {
        return equipment;
    }

    foreach (HtmlNode equipmentNode in equipmentNodes)
    {
        string title = ReadAttribute(equipmentNode, "title");
        if (title == null)
        {
            System.Diagnostics.Trace.TraceWarning(
                "Product {0}: equipment entry without a title skipped.", product.Code);
            continue;
        }

        EquipmentModel item = new EquipmentModel();
        item.Name = title;
        // NOTE(review): property name (ImageUrl) and extension casing (.png) were
        // reconstructed from a garbled listing - confirm against EquipmentModel and
        // the files under /papillon/equipmentimage/.
        item.ImageUrl = "/papillon/equipmentimage/" + title + ".png";

        try
        {
            ProductMessage.InsertProductEquipment(product.ID, item.Name, item.ImageUrl);
        }
        catch (Exception ex)
        {
            System.Diagnostics.Trace.TraceError(
                "Product {0}: inserting equipment '{1}' failed: {2}", product.Code, item.Name, ex);
        }

        // The entry is reported to the caller even when the insert failed.
        equipment.Add(item);
    }

    return equipment;
}

// Stores the absolute URL of every full-size image linked from the detail page.
private void SyncImages(ProductMessage product, HtmlNode detailRoot)
{
    HtmlNodeCollection imageLinks = detailRoot.SelectNodes("//*[@title='See full size image']");
    if (imageLinks == null)
    {
        return;
    }

    foreach (HtmlNode imageLink in imageLinks)
    {
        string source = ReadAttribute(imageLink.SelectSingleNode("img"), "src");
        if (source == null)
        {
            System.Diagnostics.Trace.TraceWarning(
                "Product {0}: image link without an <img src> skipped.", product.Code);
            continue;
        }

        // The src attribute is prefixed with the site root to form the original image address.
        string imageUrl = SiteBaseUrl + source;

        try
        {
            ProductMessage.InsertProductImage(product.ID, imageUrl);
        }
        catch (Exception ex)
        {
            System.Diagnostics.Trace.TraceError(
                "Product {0}: inserting image '{1}' failed: {2}", product.Code, imageUrl, ex);
        }
    }
}
View Code
XPath treats an HTML document as an XML-like tree of nodes. By addressing different nodes you can pull out different pieces of content, which makes it possible to extract exactly the data you want from a web page. Note that this is not the same thing as a general-purpose web crawler.
Use XPath to get data from a Web page