namespace Microshaoft
{
using System;
using System.IO;
using System.Net;
#region Class IdentifyEncoding.....
/// <summary>
/// 檢測字元編碼的類
/// <seealso cref="Stream"/>
/// <seealso cref="Uri"/>
/// <seealso cref="FileInfo"/>
/// </summary>
/// <remarks>
/// <![CDATA[
/// <strong>IdentifyEncoding</strong> 用來檢測 <see cref="Uri"/>,<see cref="FileInfo"/>,<see cref="sbyte"/> 位元組數組的編碼.
/// Create By lion <br/>
/// 2005-02-21 22:00 <br/>
/// Support .Net Framework v1.1.4322 <br/>
/// WebSite:www.lionsky.net(lion-a AT sohu.com) <br/>
/// ]]>
/// </remarks>
public class IdentifyEncoding
{
#region Fields.....
// Frequency tables to hold the GB, GBK, Big5, and EUC-TW character
// frequencies. Only the outer dimension of each jagged array is allocated
// here; the inner rows are not created by these declarations.
// NOTE(review): the constructor calls Initialize_Frequencies(), which
// presumably allocates and fills the rows -- it is defined outside this view.
internal static int[][] GBFreq = new int[94][];
internal static int[][] GBKFreq = new int[126][];
internal static int[][] Big5Freq = new int[94][];
internal static int[][] EUC_TWFreq = new int[94][];
// Display names returned by GetEncodingName(sbyte[]); the position of each
// name matches the slot of the corresponding score in that method
// (0 = GB2312 ... 8 = ASCII, 9 = OTHER).
internal static string[] nicename = new string[]
{
"GB2312", "GBK", "HZ", "Big5", "CNS 11643"
, "ISO 2022CN", "UTF-8", "Unicode", "ASCII", "OTHER"
};
#endregion
#region Methods.....
/// <summary>
/// Initializes a new instance of <see cref="IdentifyEncoding"/>.
/// </summary>
/// <remarks>
/// The only work done here is a call to <c>Initialize_Frequencies()</c>.
/// NOTE(review): that method is defined outside this view; it presumably
/// populates the static frequency tables -- confirm.
/// </remarks>
public IdentifyEncoding()
{
Initialize_Frequencies();
}
#region GetEncodingName.....
/// <summary>
/// Guesses the character encoding of the content served at the given <see cref="Uri"/>.
/// </summary>
/// <param name="testurl">The <see cref="Uri"/> whose response body is examined.</param>
/// <returns>
/// The detected encoding name, one of: "GB2312", "GBK", "HZ", "Big5", "CNS 11643",
/// "ISO 2022CN", "UTF-8", "Unicode", "ASCII", "OTHER".
/// </returns>
/// <remarks>
/// At most the first 1024 bytes of the response are read. Any exception raised while
/// requesting or reading the URL is written to standard error and not rethrown; detection
/// then runs on whatever bytes were read before the failure (possibly none).
/// </remarks>
/// <example>
/// The following sample shows how to call <see cref="GetEncodingName"/>:
/// <code>
/// IdentifyEncoding ide = new IdentifyEncoding();
/// Response.Write(ide.GetEncodingName(new Uri("http://china5.nikkeibp.co.jp/china/news/com/200307/pr_com200307170131.html")));
/// </code>
/// </example>
public virtual string GetEncodingName(Uri testurl)
{
    sbyte[] rawtext = new sbyte[1024];
    int bytesread = 0, byteoffset = 0;
    try
    {
        // The using blocks release the response (and its connection) and the stream
        // even when reading fails part-way; previously the response was never closed
        // and the stream leaked on any exception.
        using (WebResponse response = WebRequest.Create(testurl.AbsoluteUri).GetResponse())
        using (Stream chinesestream = response.GetResponseStream())
        {
            // Stop as soon as the buffer is full instead of issuing a zero-length read.
            while (byteoffset < rawtext.Length
                && (bytesread = ReadInput(chinesestream, ref rawtext, byteoffset, rawtext.Length - byteoffset)) > 0)
            {
                byteoffset += bytesread;
            }
        }
    }
    catch (Exception e)
    {
        // Deliberate best-effort: report the problem and fall through to detection.
        Console.Error.WriteLine("Error loading or using URL " + e.ToString());
    }
    return GetEncodingName(rawtext);
}
/// <summary>
/// Guesses the character encoding of the given file.
/// </summary>
/// <param name="testfile">The <see cref="FileInfo"/> describing the file to examine.</param>
/// <returns>
/// The detected encoding name, one of: "GB2312", "GBK", "HZ", "Big5", "CNS 11643",
/// "ISO 2022CN", "UTF-8", "Unicode", "ASCII", "OTHER".
/// </returns>
/// <remarks>
/// A buffer sized by <c>FileLength(testfile)</c> is allocated and filled with a single
/// <c>ReadInput</c> call. Any exception raised while opening or reading the file is written
/// to standard error and not rethrown; detection then runs on the buffer as it stands.
/// </remarks>
/// <example>
/// The following sample shows how to call <see cref="GetEncodingName"/>:
/// <code>
/// IdentifyEncoding ide = new IdentifyEncoding();
/// Response.Write(ide.GetEncodingName(new FileInfo(@"C:\test.txt")));
/// </code>
/// </example>
public virtual string GetEncodingName(FileInfo testfile)
{
    sbyte[] rawtext = new sbyte[(int)FileLength(testfile)];
    try
    {
        // The using block closes the file handle on success and on failure;
        // previously the stream was never closed and the file stayed open.
        using (FileStream chinesefile = new FileStream(testfile.FullName, FileMode.Open, FileAccess.Read))
        {
            ReadInput(chinesefile, ref rawtext, 0, rawtext.Length);
        }
    }
    catch (Exception e)
    {
        // Deliberate best-effort: report the problem and fall through to detection.
        Console.Error.WriteLine("Error: " + e);
    }
    return GetEncodingName(rawtext);
}
/// <summary>
/// Guesses the character encoding of the given signed-byte buffer.
/// </summary>
/// <param name="rawtext">The <see cref="sbyte"/> array holding the bytes to examine.</param>
/// <returns>
/// The detected encoding name, one of: "GB2312", "GBK", "HZ", "Big5", "CNS 11643",
/// "ISO 2022CN", "UTF-8", "Unicode", "ASCII", "OTHER".
/// </returns>
/// <remarks>
/// Every per-encoding probability method is run over the buffer and the highest score wins;
/// on a tie the earlier entry in the list above is kept. "OTHER" is returned when no score
/// is greater than 50.
/// </remarks>
/// <example>
/// The following sample shows how to call <see cref="GetEncodingName"/>:
/// <code>
/// IdentifyEncoding ide = new IdentifyEncoding();
/// Response.Write(ide.GetEncodingName(IdentifyEncoding.ToSByteArray(System.Text.Encoding.GetEncoding("gb2312").GetBytes("Lion互動網路(www.lionsky.net)"))));
/// </code>
/// </example>
public virtual string GetEncodingName(sbyte[] rawtext)
{
    // Slot of the "OTHER" entry in nicename.
    const int OtherSlot = 9;

    // One score per nicename entry, evaluated in slot order.
    int[] scores = new int[]
    {
        GB2312Probability(rawtext),
        GBKProbability(rawtext),
        HZProbability(rawtext),
        BIG5Probability(rawtext),
        ENCTWProbability(rawtext),
        ISO2022CNProbability(rawtext),
        UTF8Probability(rawtext),
        UnicodeProbability(rawtext),
        ASCIIProbability(rawtext),
        0
    };

    // Pick the first slot holding the strictly highest score.
    int bestSlot = 0;
    int bestScore = 0;
    for (int slot = 0; slot < scores.Length; slot++)
    {
        int candidate = scores[slot];
        if (candidate <= bestScore)
        {
            continue;
        }
        bestSlot = slot;
        bestScore = candidate;
    }

    // Anything that did not score above 50 is reported as OTHER.
    return nicename[bestScore > 50 ? bestSlot : OtherSlot];
}
#endregion
#region About Probability.....
#region GB2312Probability
/// <summary>
/// Scores how likely the buffer is to be GB2312-encoded text.
/// </summary>
/// <param name="rawtext">The <see cref="sbyte"/> array holding the bytes to examine.</param>
/// <returns>
/// The sum of a range score and a frequency score, each scaled by 50 (nominally 0 to 100).
/// </returns>
/// <remarks>
/// Bytes are signed, so values 0x80-0xFF appear negative; non-negative bytes are skipped one
/// at a time, negative bytes are consumed as two-byte pairs. A pair counts as GB2312 when its
/// lead byte is in 0xA1-0xF7 and its trail byte is in 0xA1-0xFE.
/// NOTE(review): <c>Identity</c> is defined outside this view; presumably it returns its argument.
/// </remarks>
internal virtual int GB2312Probability(sbyte[] rawtext)
{
    // Both counters start at 1 so the ratio below never divides by zero.
    int pairsSeen = 1;
    int pairsInRange = 1;
    long freqSum = 0;
    long freqCeiling = 1;

    // The last byte can never start a pair, so stop one short of the end.
    int lastLeadLimit = rawtext.Length - 1;
    int pos = 0;
    while (pos < lastLeadLimit)
    {
        if (rawtext[pos] >= 0)
        {
            // Single-byte (7-bit) character: step over it.
            pos++;
            continue;
        }

        pairsSeen++;
        if ((sbyte)Identity(0xA1) <= rawtext[pos] && rawtext[pos] <= (sbyte)Identity(0xF7) && (sbyte)Identity(0xA1) <= rawtext[pos + 1] && rawtext[pos + 1] <= (sbyte)Identity(0xFE))
        {
            pairsInRange++;
            freqCeiling += 500;

            // Adding 256 turns the negative signed byte back into 0x80-0xFF.
            int row = rawtext[pos] + 256 - 0xA1;
            int column = rawtext[pos + 1] + 256 - 0xA1;
            int weight = GBFreq[row][column];
            if (weight != 0)
            {
                freqSum += weight;
            }
            else if (15 <= row && row < 55)
            {
                // No table entry: flat weight for rows 15-54.
                freqSum += 200;
            }
        }

        // Consume both bytes of the pair, whether or not it was in range.
        pos += 2;
    }

    float rangeval = 50 * ((float)pairsInRange / (float)pairsSeen);
    float freqval = 50 * ((float)freqSum / (float)freqCeiling);
    return (int)(rangeval + freqval);
}
#endregion
#region GBKProbability.....
/// <summary>
/// Scores how likely the buffer is to be GBK-encoded text.
/// </summary>
/// <param name="rawtext">The <see cref="sbyte"/> array holding the bytes to examine.</param>
/// <returns>
/// The sum of a range score and a frequency score, each scaled by 50, minus 1.
/// </returns>
/// <remarks>
/// Bytes are signed, so values 0x80-0xFF appear negative; non-negative bytes are skipped one
/// at a time, negative bytes are consumed as two-byte pairs. A pair is first tested against
/// the GB2312 range (lead 0xA1-0xF7, trail 0xA1-0xFE, scored with <c>GBFreq</c>) and otherwise
/// against the wider GBK range (lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE, scored with
/// <c>GBKFreq</c>).
/// NOTE(review): the final "- 1" presumably lets GB2312 win when both scores are equal -- confirm.
/// NOTE(review): <c>Identity</c> is defined outside this view; presumably it returns its argument.
/// </remarks>
internal virtual int GBKProbability(sbyte[] rawtext)
{
    // Both counters start at 1 so the ratio below never divides by zero.
    int pairsSeen = 1;
    int pairsInRange = 1;
    long freqSum = 0;
    long freqCeiling = 1;

    // The last byte can never start a pair, so stop one short of the end.
    int lastLeadLimit = rawtext.Length - 1;
    int pos = 0;
    while (pos < lastLeadLimit)
    {
        if (rawtext[pos] >= 0)
        {
            // Single-byte (7-bit) character: step over it.
            pos++;
            continue;
        }

        pairsSeen++;
        if ((sbyte)Identity(0xA1) <= rawtext[pos] && rawtext[pos] <= (sbyte)Identity(0xF7) && (sbyte)Identity(0xA1) <= rawtext[pos + 1] && rawtext[pos + 1] <= (sbyte)Identity(0xFE))
        {
            // Pair lies in the GB2312 subset: score it with the GB2312 table.
            pairsInRange++;
            freqCeiling += 500;

            // Adding 256 turns the negative signed byte back into 0x80-0xFF.
            int row = rawtext[pos] + 256 - 0xA1;
            int column = rawtext[pos + 1] + 256 - 0xA1;
            int weight = GBFreq[row][column];
            if (weight != 0)
            {
                freqSum += weight;
            }
            else if (15 <= row && row < 55)
            {
                // No table entry: flat weight for rows 15-54.
                freqSum += 200;
            }
        }
        else if ((sbyte)Identity(0x81) <= rawtext[pos] && rawtext[pos] <= (sbyte)Identity(0xFE) && (((sbyte)Identity(0x80) <= rawtext[pos + 1] && rawtext[pos + 1] <= (sbyte)Identity(0xFE)) || ((sbyte)0x40 <= rawtext[pos + 1] && rawtext[pos + 1] <= (sbyte)0x7E)))
        {
            // Pair lies in the extended GBK range: score it with the GBK table.
            pairsInRange++;
            freqCeiling += 500;

            int row = rawtext[pos] + 256 - 0x81;
            int column;
            if (0x40 <= rawtext[pos + 1] && rawtext[pos + 1] <= 0x7E)
            {
                // 7-bit trail byte: already non-negative.
                column = rawtext[pos + 1] - 0x40;
            }
            else
            {
                // 8-bit trail byte: undo the sign before offsetting.
                column = rawtext[pos + 1] + 256 - 0x80;
            }

            int weight = GBKFreq[row][column];
            if (weight != 0)
            {
                freqSum += weight;
            }
        }

        // Consume both bytes of the pair, whether or not it was in range.
        pos += 2;
    }

    float rangeval = 50 * ((float)pairsInRange / (float)pairsSeen);
    float freqval = 50 * ((float)freqSum / (float)freqCeiling);
    return (int)(rangeval + freqval) - 1;
}
#endregion
#region HZProbability.....
/// <summary>
/// Scores how likely the buffer is to be HZ-encoded text.
/// </summary>
/// <param name="rawtext">The <see cref="sbyte"/> array holding the bytes to examine.</param>
/// <returns>
/// The sum of a range score (0, 39, 41 or 50 depending on how many "~{" openers were found)
/// and a frequency score scaled by 50.
/// </returns>
/// <remarks>
/// The buffer is scanned for the "~{" opener. Inside an opened section byte pairs are scored
/// against <c>GBFreq</c> until "~}" or a CR/LF byte ends the section. Outside a section, "~}"
/// and "~~" are stepped over as two-byte sequences.
/// Two defects of the earlier version are corrected here: a '~' in the last position no
/// longer reads past the end of the buffer, and the 8-bit pair test now compares unsigned
/// byte values (the previous comparison of a signed byte against 161/247 could never succeed).
/// </remarks>
internal virtual int HZProbability(sbyte[] rawtext)
{
    int i, rawtextlen;
    int hzstart = 0;
    long hzfreq = 0, totalfreq = 1; // totalfreq starts at 1 to avoid dividing by zero
    float rangeval = 0, freqval = 0;
    int row, column;
    rawtextlen = rawtext.Length;
    for (i = 0; i < rawtextlen; i++)
    {
        // Only '~' starts an escape, and it needs a following byte to be meaningful.
        // The bounds test keeps a trailing '~' from indexing past the buffer.
        if (rawtext[i] != '~' || i + 1 >= rawtextlen)
        {
            continue;
        }

        if (rawtext[i + 1] == '{')
        {
            hzstart++;
            i += 2;
            while (i < rawtextlen - 1)
            {
                if (rawtext[i] == 0x0A || rawtext[i] == 0x0D)
                {
                    // A line break ends the section.
                    break;
                }
                else if (rawtext[i] == '~' && rawtext[i + 1] == '}')
                {
                    // Explicit closer: step onto its second byte and leave the section.
                    i++;
                    break;
                }
                else if ((0x21 <= rawtext[i] && rawtext[i] <= 0x77) && (0x21 <= rawtext[i + 1] && rawtext[i + 1] <= 0x77))
                {
                    // 7-bit pair.
                    row = rawtext[i] - 0x21;
                    column = rawtext[i + 1] - 0x21;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0)
                    {
                        hzfreq += GBFreq[row][column];
                    }
                    else if (15 <= row && row < 55)
                    {
                        // No table entry: flat weight for rows 15-54.
                        hzfreq += 200;
                    }
                }
                else
                {
                    // 8-bit pair: mask to the unsigned byte value before range-checking.
                    int lead = rawtext[i] & 0xFF;
                    int trail = rawtext[i + 1] & 0xFF;
                    if ((0xA1 <= lead && lead <= 0xF7) && (0xA1 <= trail && trail <= 0xF7))
                    {
                        row = lead - 0xA1;
                        column = trail - 0xA1;
                        totalfreq += 500;
                        if (GBFreq[row][column] != 0)
                        {
                            hzfreq += GBFreq[row][column];
                        }
                        else if (15 <= row && row < 55)
                        {
                            // No table entry: flat weight for rows 15-54.
                            hzfreq += 200;
                        }
                    }
                }
                // Consume the pair whether or not it was scored.
                i += 2;
            }
        }
        else if (rawtext[i + 1] == '}' || rawtext[i + 1] == '~')
        {
            // "~}" or "~~" outside a section: skip the second byte of the sequence.
            i++;
        }
    }

    if (hzstart > 4)
    {
        rangeval = 50;
    }
    else if (hzstart > 1)
    {
        rangeval = 41;
    }
    else if (hzstart > 0)
    {
        // Only 39 in case the sequence happened to occur
        // in otherwise non-Hz text
        rangeval = 39;
    }
    else
    {
        rangeval = 0;
    }
    freqval = 50 * ((float)hzfreq / (float)totalfreq);
    return (int)(rangeval + freqval);
}
#endregion
#region BIG5Probability.....
/// <summary>
/// 判斷是BIG5編碼的可能性
/// </summary>
/// <param name="rawtext">要判斷的 <see cref="sbyte"/> 位元組數組</param>
/// <returns>返回 0 至 100 之間的可能性</returns>
internal virtual int BIG5Probability(sbyte[] rawtext)
{
int i, rawtextlen = 0;
int dbchars = 1, bfchars = 1;
float rangeval = 0, freqval = 0;
long bffreq = 0, totalfreq = 1;
int row, column;
// Check to see if characters fit into acceptable ranges
rawtextlen = rawtext.Length;
for (i = 0; i < rawtextlen - 1; i++)
{
if (rawtext[i] >= 0)
{
//asciichars++;
}
else
{
dbchars++;
if ((sbyte)Identity(0xA1) <= rawtext[i] && rawtext[i] <= (sbyte)Identity(0xF9) && (((sbyte)0x40 <= rawtext[i + 1] && rawtext[i + 1] <= (sbyte)0x7E) || ((sbyte)Ident