HtmlAgilityPack
You can use HtmlAgilityPack to get page elements by looking up HTML nodes in an object-oriented manner. Reference: http://html-agility-pack.net
HtmlDocument class
//Methods
LoadHtml(string content);
//Load HTML data
CreateNode(string html)
//Create an HtmlNode
CreateAttribute(string attriName, string attriVal)
//Create an attribute
CreateTextNode(string text)
//Create a text node
//Other references: http://html-agility-pack.net/utilities and http://html-agility-pack.net/writer
//Properties
DocumentNode
//Get the root node; returns an HtmlNode
HtmlNode class
//Methods
SelectNodes(string xpath)
//Get the collection of nodes matching the XPath expression
SelectSingleNode(string xpath)
//Get the first node in the collection of nodes matching the XPath expression
SetAttributeValue(HtmlAttribute attri | string attriName, string attriVal)
//Modify an attribute of the current node
PrependChild(HtmlNode node)
//Insert a new child node at the beginning of the current node
AppendChild(HtmlNode node)
//Insert a new child node at the end of the current node
PrependChildren(HtmlNodeCollection nodes)
//Insert a collection of nodes at the beginning of the current node's children
AppendChildren(HtmlNodeCollection nodes)
//Insert a collection of nodes at the end of the current node's children
InsertAfter(HtmlNode node)
//Insert a new node after the current node; InsertBefore works similarly
Remove()
//Remove the current node itself
RemoveAll()
//Remove all child nodes and attributes of the current node
RemoveAllChildren()
//Remove all child nodes of the current node
RemoveChild(HtmlNode oldChild | HtmlNode oldChild, bool keepGrandChildren)
//keepGrandChildren: whether to keep the children of the removed node instead of removing them along with it
//Remove the child node specified by the parameter from the current node
ReplaceChild(HtmlNode newChild, HtmlNode oldChild);
//In the current node, replace oldChild with newChild
Clone()
//Create a copy
CloneNode(bool deep | string name | string name, bool deep)
//deep: whether to clone deeply; if false, only the node itself is cloned
//name: the tag name to give the cloned node, e.g. clone the content of an h1 but name the resulting node h2
CopyFrom(HtmlNode node | HtmlNode node, bool deep)
//deep: whether to copy deeply; if false, only the node itself is copied
//Copy into the current node everything contained in the node specified by the parameter
Element(string name)
//Get the child node of the current node with the name specified by the parameter; returns a single HtmlNode
Elements(string name)
//Get the collection of child nodes of the current node with the name specified by the parameter; returns an IEnumerable<HtmlNode>
Ancestors(string name)
//Get the ancestor nodes of the current node with the name specified by the parameter; if no argument is given, get the collection of all ancestor nodes of the current node. Similar: AncestorsAndSelf(), AncestorsAndSelf(string name)
Descendants(string name)
//Get the descendant nodes of the current node with the name specified by the parameter; if no argument is given, get the collection of all descendant nodes of the current node. Similar: DescendantsAndSelf(), DescendantsAndSelf(string name)
DescendantNodes()
//Get all descendant nodes; similar: DescendantNodesAndSelf()
//Properties
InnerHtml
//Get or set the HTML markup that the current node contains
InnerText
//Get or set the text that the current node contains
OuterHtml
//Get the HTML markup of the current node, including the node itself
ParentNode
//Gets the parent node of the current node
ChildNodes
//Get all child nodes, similar to FirstChild, LastChild, NextSibling, ParentNode
HtmlAttribute class
//Methods
Add(HtmlAttribute attri | string attriName, string attriVal)
//Add an attribute; similar: Append()
Remove(string attriName)
//This method is called on a node's attribute collection; it removes the attribute specified by the parameter from the node, or removes all attributes if no argument is given. Similar: RemoveAll(), RemoveAt(int index)
Example:
/// <summary>
/// Downloads a weather page, parses it with HtmlAgilityPack, and looks up the
/// single &lt;div&gt; element whose inner text contains "Sam".
/// </summary>
private void Test()
{
    HtmlDocument doc = new HtmlDocument();

    // Three ways to load HTML into the document:
    // 1. From a reader:
    //      System.IO.StreamReader reader = System.IO.File.OpenText(url);
    //      doc.Load(reader);
    // 2. From markup that has already been downloaded (the approach used below):
    //      doc.LoadHtml(content);
    //    Calling doc.Load(url) with a URL reports that the path contains illegal characters.
    // 3. From an inline string:
    //      string htmlText = "<div>xxx</div>";
    //      doc.LoadHtml(htmlText);
    string url = "Http://www.weather.com.cn/weathern/101040100.shtml";
    string content;

    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient wc = new WebClient())
    {
        wc.Encoding = Encoding.GetEncoding("Utf-8");
        content = wc.DownloadString(url);
    }

    doc.LoadHtml(content);
    HtmlNode root = doc.DocumentNode; // Get root node

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // so guard before running LINQ over the result.
    var divs = root.SelectNodes("//div");

    // SingleOrDefault yields null when no div matches and throws
    // InvalidOperationException when more than one div matches.
    var div = divs == null
        ? null
        : divs.Where(d => d.InnerText.Contains("Sam")).SingleOrDefault();
}
ASP.NET MVC - processing HTML data