Today I ran into a small problem: I needed to determine the text encoding of a file. It is not difficult — it comes down to checking the first few bytes of the file for a byte order mark (BOM). If the file does not contain a BOM, a caller-supplied default encoding is returned.
Code:
/// <summary>
/// Attempts to determine a file's text encoding from its byte order mark (BOM).
/// </summary>
/// <param name="file">Path of the file to inspect.</param>
/// <param name="defenc">Encoding returned when the file does not start with a recognized BOM.</param>
/// <returns>
/// The encoding that matches the BOM, or <paramref name="defenc"/> when no BOM is recognized
/// (including files shorter than any BOM). Null is returned only if the opened stream
/// reports that it is not readable.
/// </returns>
/// <remarks>
/// Exceptions thrown while opening or reading the file are not caught here and
/// propagate to the caller.
/// </remarks>
static Encoding getencoding(string file, Encoding defenc)
{
    using (var stream = File.OpenRead(file))
    {
        // Bail out if the stream cannot be read at all.
        if (!stream.CanRead)
            return null;

        // The longest BOM checked here (UTF-32) is 4 bytes.
        var bom = new byte[4];

        // Number of bytes actually read. A single Read call may legally return fewer
        // bytes than requested, so keep reading until the buffer is full or the
        // stream ends.
        int readc = 0;
        while (readc < bom.Length)
        {
            int got = stream.Read(bom, readc, bom.Length - readc);
            if (got <= 0)
                break;
            readc += got;
        }

        if (readc >= 4)
        {
            // UTF-32, big-endian
            if (checkbytes(bom, 4, 0x00, 0x00, 0xFE, 0xFF))
                return new UTF32Encoding(true, true);

            // UTF-32, little-endian. This must be tested before UTF-16 little-endian
            // because its BOM starts with the same two bytes (FF FE).
            if (checkbytes(bom, 4, 0xFF, 0xFE, 0x00, 0x00))
                return new UTF32Encoding(false, true);
        }

        // UTF-8
        if (readc >= 3 && checkbytes(bom, 3, 0xEF, 0xBB, 0xBF))
            return new UTF8Encoding(true);

        if (readc >= 2)
        {
            // UTF-16, big-endian
            if (checkbytes(bom, 2, 0xFE, 0xFF))
                return new UnicodeEncoding(true, true);

            // UTF-16, little-endian
            if (checkbytes(bom, 2, 0xFF, 0xFE))
                return new UnicodeEncoding(false, true);
        }

        // No recognized BOM: fall back to the caller's default.
        return defenc;
    }
}
/// <summary>
/// Helper that tests whether the first <paramref name="count"/> bytes of
/// <paramref name="bytes"/> equal the first <paramref name="count"/> entries of
/// <paramref name="values"/>.
/// </summary>
/// <param name="bytes">Buffer to inspect.</param>
/// <param name="count">Number of leading positions to compare.</param>
/// <param name="values">Expected byte values, in order.</param>
/// <returns>
/// True when every compared position matches; false on the first mismatch, or when
/// either array is null or shorter than <paramref name="count"/>.
/// </returns>
static bool checkbytes(byte[] bytes, int count, params int[] values)
{
    // Refuse to compare past the end of either array instead of throwing.
    if (bytes == null || values == null || count > bytes.Length || count > values.Length)
        return false;

    for (int i = 0; i < count; i++)
    {
        if (bytes[i] != values[i])
            return false;
    }
    return true;
}
For example, the code above can be used to print the encoding of every .txt file in the "My Documents" directory:
// Print the path and detected encoding name of every .txt file in "My Documents",
// using ASCII as the fallback when a file has no BOM.
foreach (var file in Directory.GetFiles(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), "*.txt"))
{
    var enc = getencoding(file, Encoding.ASCII);
    // getencoding is documented as possibly returning null, so do not dereference blindly.
    Console.WriteLine("{0}\n{1}\n", file, enc == null ? "(unreadable)" : enc.EncodingName);
}