http://www.cnblogs.com/yuandong/archive/2008/08/28/Web_Spider_Url_Index.html
The article above implements a URL index for a web spider in C++. Inspired by it, I implemented the same idea in C#. Without further ado, here is the code.
First, define the URLIdentity class, which marks the URLs that have been crawled and determines whether a given URL has already been crawled.
Using system;
Using system. Collections. Generic;
Using system. LINQ;
Using system. text;
Using system. collections;
Using system. Security. cryptography;
namespace TestMD5
{
    /// <summary>
    /// In-memory index of crawled URLs.
    /// Each URL is reduced to an integer hash derived from its MD5 digest, and
    /// that hash selects a single bit in a lazily allocated, segmented bitmap.
    /// Two different URLs that produce the same hash share a bit, so
    /// <see cref="GetUrlIdentity"/> can report a URL as crawled when only a
    /// colliding URL was recorded.
    /// </summary>
    public class URLIdentity
    {
        /// <summary>Number of bits held by one bitmap segment.</summary>
        private const int SegmentSize = 25000;

        /// <summary>
        /// Number of segment slots. Hashes are below 100,000,000, so the
        /// largest segment index is 99,999,999 / 25,000 = 3,999, which fits.
        /// </summary>
        private const int SegmentCount = 4096;

        /// <summary>Number of leading decimal digits kept to form the hash.</summary>
        private const int HashDigitCount = 8;

        // Segments stay null until the first URL that maps into them is recorded.
        private readonly BitArray[] SegmentArray = new BitArray[SegmentCount];

        /// <summary>Returns the index of the segment that holds the bit for <paramref name="hashValue"/>.</summary>
        private int GetSegmentIndex(int hashValue)
        {
            return hashValue / SegmentSize;
        }

        /// <summary>Returns the position of the bit for <paramref name="hashValue"/> inside its segment.</summary>
        private int GetSegmentOffset(int hashValue)
        {
            return hashValue % SegmentSize;
        }

        /// <summary>
        /// Computes the integer hash of a URL: the MD5 digest bytes are written
        /// out as decimal numbers, concatenated, and the first eight digits are
        /// parsed as an integer.
        /// </summary>
        /// <param name="url">The URL to hash.</param>
        /// <returns>A value in the range 0 to 99,999,999.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public int GetIntHashCode(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            byte[] digest;
            using (MD5 md5 = MD5.Create())
            {
                // UTF-8 keeps the hash independent of the machine's ANSI code page
                // and never collapses unmappable characters to '?'. ASCII-only URLs
                // produce the same bytes as before.
                digest = md5.ComputeHash(Encoding.UTF8.GetBytes(url));
            }

            // Every byte contributes at least one digit, so the 16-byte digest
            // always yields the required digits; stop as soon as there are enough.
            StringBuilder digits = new StringBuilder();
            for (int i = 0; i < digest.Length && digits.Length < HashDigitCount; i++)
            {
                digits.Append(digest[i]);
            }

            return Int32.Parse(digits.ToString().Substring(0, HashDigitCount));
        }

        /// <summary>
        /// Records a URL as crawled. The misspelled method name is kept so that
        /// existing callers continue to compile.
        /// </summary>
        /// <param name="url">The URL to record.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public void SetUrlIndentity(string url)
        {
            int hashNum = GetIntHashCode(url);
            int segIndex = GetSegmentIndex(hashNum);
            int segOffset = GetSegmentOffset(hashNum);

            // Allocate the segment on first use.
            if (SegmentArray[segIndex] == null)
            {
                SegmentArray[segIndex] = new BitArray(SegmentSize);
            }

            SegmentArray[segIndex][segOffset] = true;
        }

        /// <summary>
        /// Reports whether the bit for a URL is set, that is, whether this URL
        /// (or another URL with the same hash) has been recorded.
        /// </summary>
        /// <param name="url">The URL to look up.</param>
        /// <returns>true if the URL's bit is set; otherwise false.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public bool GetUrlIdentity(string url)
        {
            int hashNum = GetIntHashCode(url);
            int segIndex = GetSegmentIndex(hashNum);
            int segOffset = GetSegmentOffset(hashNum);

            // A segment that was never allocated cannot contain a set bit.
            if (SegmentArray[segIndex] == null)
            {
                return false;
            }

            return SegmentArray[segIndex][segOffset];
        }
    }
}
Demo code:
Using system;
Using system. Collections. Generic;
Using system. LINQ;
Using system. text;
Using system. Security. cryptography;
Using system. Web. Security;
Using system. runtime. interopservices;
Using system. collections;
// Uses the same namespace as URLIdentity (namespace names are case-sensitive)
// so the type resolves without an extra using directive.
namespace TestMD5
{
    /// <summary>
    /// Console demo: records one URL in a <see cref="URLIdentity"/> index and
    /// then looks it up again.
    /// </summary>
    class Program
    {
        /// <summary>Program entry point.</summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            URLIdentity urlIdentity = new URLIdentity();
            string str = "http://www.cnblogs.com ";

            urlIdentity.SetUrlIndentity(str);

            // The URL was recorded just above, so this lookup is expected to succeed.
            if (urlIdentity.GetUrlIdentity(str))
            {
                Console.WriteLine("this url had been crawler ");
            }
        }
    }
}