using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
1. 先取得網頁的原始碼
// Download the HTML source of the page into `str`.
Uri url = new Uri("http://www.blogjava.net/wujun");
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
string str;
// The using statements release the response, its stream and the reader even when
// the request or the read throws; disposal happens in reverse order of declaration.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (Stream stream = response.GetResponseStream())
using (StreamReader sr = new StreamReader(stream))
{
    // No encoding is passed to StreamReader, so its default decoding is used.
    str = sr.ReadToEnd();
}
得到網頁的 HTML 原始碼以後，再根據原始碼分析所有 <a href="url"> 標籤，最後得到 href 後面 url 的連結地址。
// Matches <a ... href=...>text</a> elements. The href value may be double-quoted,
// single-quoted or unquoted; the code that consumes this regex reads the URL from
// group 1, and the link text is captured under group 2.
// The pattern is split into two concatenated verbatim literals so that no line break
// ends up inside the pattern itself.
// NOTE(review): in .NET regular expressions `\a` is the bell character (U+0007), so
// `\a-z` is a range starting at U+0007 - confirm whether a literal backslash was intended.
Regex RegExFindHref = new Regex(
    @"<a\s+([^>]*\s*)?href\s*=\s*(?:""(?<1>[/\a-z0-9_][^""]*)""|'(?<1>[/\a-z0-9_][^']*)'" +
    @"|(?<1>[/\a-z0-9_]\S*))(\s[^>]*)?>(?<2>.*?)</a>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
以迴圈讀出連結地址
// Append every extracted href value (group 1) to TextBox1, one per line.
// The text is accumulated in a StringBuilder and assigned once, instead of reading
// and rewriting TextBox1.Text for every single match.
StringBuilder links = new StringBuilder(TextBox1.Text);
for (Match m = RegExFindHref.Match(str); m.Success; m = m.NextMatch())
{
    links.Append(m.Groups[1].Value).Append('\n');
}
TextBox1.Text = links.ToString();
運行後，TextBox1 將顯示分析後所得的所有網頁連結：
http://www.dotlucene.net/
http://www.castleproject.org/
http://www.codeplex.com/
http://www.codeproject.com/
http://www.asp.net/
http://www.nhibernate.org/
http://www.blogjava.net/wujun/CommentsRSS.aspx
http://www.blogjava.net/wujun/archive/2006/10/23/47150.html#76745
http://www.blogjava.net/wujun/archive/2006/10/23.html
http://www.blogjava.net/wujun/archive/2006/10/23/76769.html
http://www.blogjava.net/wujun/archive/2006/10/23/76769.html
http://www.blogjava.net/wujun/archive/2006/10/23/76769.html#FeedBack
http://www.blogjava.net/wujun/admin/EditPosts.aspx?postid=76769
http://www.blogjava.net/wujun/AddToFavorite.aspx?id=76769
http://www.blogjava.net/wujun/archive/2006/10/20.html
......
..............
.........................等等等。。。
代碼
/// <summary>
/// Returns the text located between the first occurrence of <paramref name="strStart"/>
/// and the next occurrence of <paramref name="strEnd"/> that follows it.
/// </summary>
/// <param name="str">The text to search.</param>
/// <param name="strStart">The marker that precedes the wanted text.</param>
/// <param name="strEnd">The marker that follows the wanted text.</param>
/// <returns>
/// The text between the two markers, or an empty string when any argument is null
/// or when either marker cannot be found.
/// </returns>
public static string AnalyzeMessage(string str, string strStart, string strEnd)
{
    // Treat missing input like "marker not found" rather than throwing.
    if (str == null || strStart == null || strEnd == null)
    {
        return "";
    }

    // Ordinal search: the culture-sensitive default can match ignorable characters at
    // unexpected positions, which would misalign the index arithmetic below.
    int startIndex = str.IndexOf(strStart, System.StringComparison.Ordinal);
    if (startIndex < 0)
    {
        return "";
    }

    // The end marker is only searched for after the start marker.
    int contentIndex = startIndex + strStart.Length;
    int endIndex = str.IndexOf(strEnd, contentIndex, System.StringComparison.Ordinal);
    if (endIndex < 0)
    {
        return "";
    }

    return str.Substring(contentIndex, endIndex - contentIndex);
}