using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
/// <summary>
/// Console tool that downloads a web page, extracts the absolute hyperlinks it
/// contains and writes them, de-duplicated and sorted, to an XML file.
/// </summary>
public class App
{
    // Compiled once and reused; both patterns are matched case-insensitively.
    // The path character class deliberately has no blank in it, so a URL that
    // appears in running text does not swallow the words that follow it.
    private static readonly Regex LinkRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w\-./?%&=]*)?",
        RegexOptions.IgnoreCase);

    private static readonly Regex DomainRegex = new Regex(
        @"\.(com|net|cn|org|gov)/",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Entry point: prompts for a page address, fetches the page, extracts its
    /// links and writes them to the XML output file.
    /// </summary>
    public static void Main()
    {
        Console.Write("enter a webpage address :");
        string input = Console.ReadLine();

        // ReadLine returns null at end of input; an empty address cannot be fetched.
        if (input == null || input.Trim().Length == 0)
        {
            Console.WriteLine("No address entered.");
            return;
        }

        string strUrl = NormalizeUrl(input);

        Console.WriteLine("retrieving page...Code, Please wait ...");
        string strCode = GetPageSource(strUrl);

        Console.WriteLine("extracting hyperlinks, please wait ...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("Writing files, please wait ...");
        WriteToXml(strUrl, alLinks);
    }

    /// <summary>
    /// Trims the address and prefixes it with "http://" unless it already
    /// starts with an http or https scheme (compared case-insensitively).
    /// </summary>
    /// <param name="address">Address as typed by the user; must not be null.</param>
    /// <returns>The address with a scheme.</returns>
    private static string NormalizeUrl(string address)
    {
        string url = address.Trim();

        // StartsWith is safe for inputs shorter than the prefix, unlike Substring(0, 7).
        bool hasScheme =
            url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        return hasScheme ? url : "http://" + url;
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The response body decoded as gb2312.</returns>
    /// <remarks>
    /// The gb2312 encoding is hard-coded. NOTE(review): on runtimes that do not
    /// ship code-page encodings by default this lookup may need an encoding
    /// provider to be registered first; confirm for the target runtime.
    /// </remarks>
    private static string GetPageSource(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));

        // Request options must be set before GetResponse() sends the request.
        request.Method = "GET";
        request.KeepAlive = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct http/https URLs found in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source; null or empty yields an empty list.</param>
    /// <returns>A sorted list of unique URL strings.</returns>
    private static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList links = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return links;
        }

        // The set filters duplicate URLs in O(1) per match instead of rescanning the list.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match match in LinkRegex.Matches(htmlCode))
        {
            if (seen.Add(match.Value))
            {
                links.Add(match.Value);
            }
        }

        links.Sort();
        return links;
    }

    /// <summary>
    /// Writes the extracted URLs to "hyperlinks.xml" in the current directory.
    /// Each URL becomes an element named after its domain suffix.
    /// </summary>
    /// <param name="strUrl">Address of the page the links were extracted from.</param>
    /// <param name="alHyperLinks">URL strings to write.</param>
    private static void WriteToXml(string strUrl, ArrayList alHyperLinks)
    {
        using (XmlTextWriter writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("hyperlinks", null, "URLs.DTD", null);

            // An XML comment may not contain "--"; WriteComment throws if it does,
            // so break up any such run coming from the page address.
            string safeUrl = strUrl;
            while (safeUrl.Contains("--"))
            {
                safeUrl = safeUrl.Replace("--", "- -");
            }
            writer.WriteComment("extracted from" + safeUrl + "HYPERLINK ");

            writer.WriteStartElement("hyperlinks");
            writer.WriteStartElement("hyperlinks", null);
            writer.WriteAttributeString("datetime", DateTime.Now.ToString());

            foreach (string link in alHyperLinks)
            {
                writer.WriteElementString(GetDomain(link), null, link);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL (com, net, cn, org or gov) in lower
    /// case, or "other" when none of those suffixes is followed by a slash.
    /// </summary>
    /// <param name="strUrl">URL to classify.</param>
    /// <returns>A name usable as an XML element name.</returns>
    private static string GetDomain(string strUrl)
    {
        Match match = DomainRegex.Match(strUrl);
        if (!match.Success)
        {
            return "other";
        }

        // Lower-cased so "COM" and "com" do not become different element names.
        return match.Groups[1].Value.ToLowerInvariant();
    }
}
// This article is from the CSDN blog. If you reproduce it, please credit the source: http://blog.csdn.net/21aspnet/archive/2007/03/24/1540012.aspx