When developing a search engine, we need to search the HTML content of web pages, so HTML parsing is unavoidable: each node has to be split out and the content between nodes extracted. This article introduces two methods for parsing HTML in C#.
Method 1:
Use System.Net.WebClient to download the web page to a local file or a string, and then analyze it with a regular expression. This method can be used in web crawlers and other applications that need to analyze many web pages.
This is probably the most direct approach and the first one that comes to mind.
An example taken from the Internet, which extracts all href attributes:
Code
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace HttpGet
{
    /// <summary>
    /// Console sample for "Method 1": download a web page with
    /// <see cref="WebClient"/> and extract its href attributes with a
    /// regular expression.
    /// </summary>
    class Class1
    {
        /// <summary>
        /// Downloads a fixed page, decodes it as UTF-8 and writes every
        /// <c>href="..."</c> match to the console, one per line.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        [STAThread]
        static void Main(string[] args)
        {
            // WebClient is IDisposable; release it as soon as the bytes are in memory.
            byte[] page;
            using (WebClient client = new WebClient())
            {
                page = client.DownloadData("http://www.google.com");
            }

            string content = Encoding.UTF8.GetString(page);

            // Pattern, left to right:
            //   href= followed by an opening quote (" or ')
            //   an optional prefix: http://  or  ./  or  /
            //   a word, optionally followed by dot-separated words (host or file name)
            //   zero or more /segment or /segment.ext path parts
            //   an optional trailing slash or ?key=value(&key=value)* query string
            //   a closing quote (" or ')
            // Only the http:// scheme and \w characters are recognised, so links
            // using https:// or containing '-' are not matched by this pattern.
            string regex = @"href=[""'](http:\/\/|\.\/|\/)?\w+(\.\w+)*(\/\w+(\.\w+)?)*(\/|\?\w*=\w*(&\w*=\w*)*)?[""']";
            Regex re = new Regex(regex);

            foreach (Match match in re.Matches(content))
            {
                Console.Write(match.Value + "\r\n");
            }
        }
    }
}
Some crawlers use similar methods in HTML parsing.
Method 2:
Use Winista.HtmlParser.Net to parse HTML. This is an open-source HTML parser for the .NET platform. The source code can be downloaded from the Internet (a Baidu search will find it), so it is not provided here. It also comes with help documentation in English. If you cannot find it, leave your e-mail address.
I personally think this is a good solution for parsing HTML on the .NET platform; it basically satisfies our HTML parsing needs.
I wrote an example myself:
Code
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Filters;

namespace HTMLParser
{
    /// <summary>
    /// Windows Forms sample for "Method 2": download a page, parse it with
    /// Winista.HtmlParser and display the resulting node tree in a TreeView.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the controls used here (txtHtmlWhole, CBUrl, treeView1) and
    /// InitializeComponent are declared in the designer part of this partial
    /// class, which is not shown. Their spelling, and that of btnParser_Click,
    /// was reconstructed from the listing - confirm against Form1.Designer.cs.
    /// </remarks>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Initializes the form and fills the URL combo box.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            AddUrl();
        }

        /// <summary>
        /// Click handler: downloads the selected URL into the text box, then
        /// parses the text box content and rebuilds the tree view from it.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (unused).</param>
        private void btnParser_Click(object sender, EventArgs e)
        {
            #region Download the page HTML
            try
            {
                txtHtmlWhole.Text = "";
                string url = CBUrl.SelectedItem.ToString().Trim();

                // WebClient is IDisposable; dispose it once the page is downloaded.
                using (System.Net.WebClient aWebClient = new System.Net.WebClient())
                {
                    aWebClient.Encoding = System.Text.Encoding.Default;
                    string html = aWebClient.DownloadString(url);
                    txtHtmlWhole.Text = html;
                }
            }
            catch (Exception ex)
            {
                // Any failure (no selection, network error, ...) is reported to the
                // user; the text box stays empty and parsing below still runs on it.
                MessageBox.Show(ex.Message);
            }
            #endregion

            #region Analyze webpage HTML nodes
            Lexer lexer = new Lexer(this.txtHtmlWhole.Text);
            Parser parser = new Parser(lexer);
            NodeList htmlNodes = parser.Parse(null);

            this.treeView1.Nodes.Clear();
            this.treeView1.Nodes.Add("root");
            TreeNode treeRoot = this.treeView1.Nodes[0];

            for (int i = 0; i < htmlNodes.Count; i++)
            {
                this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
            }
            #endregion
        }

        /// <summary>
        /// Recursively mirrors an HTML node (and optionally its following
        /// siblings) into the tree view.
        /// </summary>
        /// <param name="treeNode">Tree node that receives the new entries.</param>
        /// <param name="htmlNode">HTML node to add.</param>
        /// <param name="siblingRequired">
        /// When true, every following sibling of <paramref name="htmlNode"/> is
        /// also added under <paramref name="treeNode"/>.
        /// </param>
        private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
        {
            if (htmlNode == null || treeNode == null)
            {
                return;
            }

            // Children are attached to the tag's own tree node when one is
            // created below; otherwise they go directly under treeNode.
            TreeNode current = treeNode;

            // Current node: only opening tags get their own tree node.
            ITag tag = htmlNode as ITag;
            if (tag != null && !tag.IsEndTag())
            {
                string nodeString = tag.TagName;
                if (tag.Attributes != null && tag.Attributes.Count > 0)
                {
                    if (tag.Attributes["ID"] != null)
                    {
                        nodeString = nodeString + "{id=\"" + tag.Attributes["ID"].ToString() + "\"}";
                    }
                    if (tag.Attributes["HREF"] != null)
                    {
                        nodeString = nodeString + "{href=\"" + tag.Attributes["HREF"].ToString() + "\"}";
                    }
                }
                current = new TreeNode(nodeString);
                treeNode.Nodes.Add(current);
            }

            // Content between nodes: recurse into the first child (which pulls in
            // its siblings), then add the first child's text under treeNode.
            if (htmlNode.Children != null && htmlNode.Children.Count > 0)
            {
                this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
                TreeNode content = new TreeNode(htmlNode.FirstChild.GetText());
                treeNode.Nodes.Add(content);
            }

            // Sibling nodes: each is added with siblingRequired = false so the
            // chain is walked exactly once, by this loop.
            if (siblingRequired)
            {
                INode sibling = htmlNode.NextSibling;
                while (sibling != null)
                {
                    this.RecursionHtmlNode(treeNode, sibling, false);
                    sibling = sibling.NextSibling;
                }
            }
        }

        /// <summary>
        /// Fills the URL combo box with the sample addresses.
        /// </summary>
        private void AddUrl()
        {
            CBUrl.Items.Add("http://www.hao123.com");
            CBUrl.Items.Add("http://www.sina.com");
            CBUrl.Items.Add("http://www.heuet.edu.cn");
        }
    }
}
Running effect:
With the help of the Winista.HtmlParser source code, it is easy to achieve the desired result.
Summary:
I have briefly introduced two methods for parsing HTML. If you know of any other good methods, I hope you will share them.