Links | web | web
This article shows how to write a simple web spider that extracts all the links from a web page. The approach: use the System.Net.WebClient class to download the remote page's content, then apply a URL regular expression to find the links in the HTML. The code is as follows:
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace HttpGet
{
    /// <summary>
    /// Downloads a web page and prints every hyperlink (href attribute value)
    /// found in its HTML to the console.
    /// </summary>
    class Class1
    {
        [STAThread]
        static void Main(string[] args)
        {
            // WebClient is IDisposable; dispose it so the underlying
            // connection resources are released when we are done.
            using (WebClient client = new WebClient())
            {
                byte[] page = client.DownloadData("http://news.163.com");
                string content = Encoding.UTF8.GetString(page);

                // Matches href="..." or href='...' whose value is an absolute
                // (http://), relative (./ or /) or bare path, optionally
                // followed by a ?key=value(&key=value)* query string.
                string pattern = "href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
                Regex re = new Regex(pattern);

                // MatchCollection is directly enumerable as Match — no need
                // for a manual IEnumerator walk with casts.
                foreach (Match match in re.Matches(content))
                {
                    Console.WriteLine(match.Value);
                }
            }
        }
    }
}