I have recently been learning about character encodings.
The following articles are useful background reading:
http://dev.csdn.net/develop/article/69/69883.shtm
http://dev.csdn.net/develop/article/72/72888.shtm
This section describes the UTF-8 encoding.
When the value to be encoded fits in 7 bits, one byte is used: 0xxxxxxx. The leading 0 is the flag; the remaining 7 bits hold the value, which covers ASCII 0-127.
When the value to be encoded needs 8 to 11 bits, two bytes are used: 110xxxxx 10xxxxxx. The 110 of the first byte and the 10 of the second byte are flags; the remaining 11 bits hold the value.
When the value to be encoded needs 12 to 16 bits, three bytes are used: 1110xxxx 10xxxxxx 10xxxxxx. As above, the 1110 of the first byte and the 10 of the second and third bytes are flags; the remaining 16 bits are enough to represent Chinese characters.
And so on:
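Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
Five bytes: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
Six bytes: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
(The five- and six-byte forms come from the original UTF-8 design; RFC 3629 restricts UTF-8 to at most four bytes.)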
........................................ .....
I wrote the following code for conversion:
Conversion between UCS and UTF-8
/*
 * Encode a NUL-terminated wide-character string as UTF-8.
 *
 * pucs  - source string; every element is encoded as one code point.
 *         UTF-16 surrogate pairs are NOT combined: each 16-bit unit is
 *         encoded on its own as a three-byte sequence.
 * putf8 - destination buffer supplied by the caller.  It must be large
 *         enough for the worst case: 3 bytes per element when wchar_t is
 *         16 bits wide, 4 bytes per element otherwise.  No terminating
 *         NUL byte is written.
 *
 * Returns the number of bytes written to putf8, or -1 if either pointer
 * is NULL or an element is not a valid code point (above 0x10FFFF).
 */
int ucs2utf8(wchar_t *pucs, unsigned char *putf8)
{
    unsigned char *out;
    size_t ucslen;
    size_t i;

    /* Validate first: wcslen() dereferences its argument. */
    if (pucs == NULL || putf8 == NULL)
        return -1;

    ucslen = wcslen(pucs);
    out = putf8;

    for (i = 0; i < ucslen; i++) {
        /* Going through unsigned long makes a negative (signed) wchar_t
         * show up as an out-of-range value instead of a small one. */
        unsigned long cp = (unsigned long)pucs[i];

        if (cp <= 0x007FUL) {
            /* 1 byte: 0xxxxxxx */
            *out++ = (unsigned char)cp;
        } else if (cp <= 0x07FFUL) {
            /* 2 bytes: 110xxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xC0 | (cp >> 6));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        } else if (cp <= 0xFFFFUL) {
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xE0 | (cp >> 12));
            *out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        } else if (cp <= 0x10FFFFUL) {
            /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             * (only reachable when wchar_t is wider than 16 bits) */
            *out++ = (unsigned char)(0xF0 | (cp >> 18));
            *out++ = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
            *out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        } else {
            return -1;
        }
    }

    return (int)(out - putf8);
}
/*
 * Decode a NUL-terminated UTF-8 string into wide characters.
 *
 * putf8 - source bytes, terminated by a 0 byte.
 * pucs  - destination buffer supplied by the caller; it must have room
 *         for every decoded element.  No terminating NUL is written.
 *
 * Code points above 0xFFFF are stored as one element when wchar_t is wider
 * than 16 bits, and as a UTF-16 surrogate pair (two elements) otherwise.
 * Overlong encodings are not rejected.
 *
 * Returns the number of wchar_t elements written, or -1 if either pointer
 * is NULL or the input is malformed (bad lead byte, missing continuation
 * byte, or a value above 0x10FFFF).  On a -1 return pucs may have been
 * partially written.
 */
int utf82ucs(unsigned char *putf8, wchar_t *pucs)
{
    const unsigned char *src;
    int ucslen = 0;

    if (pucs == NULL || putf8 == NULL)
        return -1;

    src = putf8;
    while (*src != 0) {
        unsigned char lead = *src++;
        unsigned long cp;
        int trail; /* continuation bytes still expected */

        if (lead <= 0x7F) {            /* 0xxxxxxx */
            cp = lead;
            trail = 0;
        } else if (lead >= 0xC0 && lead <= 0xDF) {   /* 110xxxxx */
            cp = lead & 0x1FU;
            trail = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {   /* 1110xxxx */
            cp = lead & 0x0FU;
            trail = 2;
        } else if (lead >= 0xF0 && lead <= 0xF7) {   /* 11110xxx */
            cp = lead & 0x07U;
            trail = 3;
        } else {
            /* Stray continuation byte or an unsupported lead byte. */
            return -1;
        }

        while (trail-- > 0) {
            /* Also stops at the terminator, so a truncated sequence
             * never reads past the end of the input. */
            if ((*src & 0xC0) != 0x80)
                return -1;
            cp = (cp << 6) | (unsigned long)(*src++ & 0x3F);
        }

        if (cp > 0x10FFFFUL)
            return -1;

        if (cp > 0xFFFFUL && sizeof(wchar_t) == 2) {
            /* 16-bit wchar_t: split into a UTF-16 surrogate pair. */
            cp -= 0x10000UL;
            pucs[ucslen++] = (wchar_t)(0xD800UL | (cp >> 10));
            pucs[ucslen++] = (wchar_t)(0xDC00UL | (cp & 0x3FFUL));
        } else {
            pucs[ucslen++] = (wchar_t)cp;
        }
    }

    return ucslen;
}
Conversion between other encodings and UCS
/*
 * Convert a NUL-terminated multibyte string to wide characters using the
 * Win32 MultiByteToWideChar API.
 *
 * p        - source string encoded in the given code page.
 * pucs     - destination buffer supplied by the caller; its size is not
 *            passed in, so it must already be large enough for the result
 *            including the terminating NUL.
 * codePage - code page identifier handed to MultiByteToWideChar.
 *
 * Returns the value reported by MultiByteToWideChar: the number of wide
 * characters written INCLUDING the terminating NUL (note that ucsto
 * excludes the terminator from its count), or 0 if the conversion failed.
 * Returns -1 if either pointer is NULL.
 */
int toucs(unsigned char *p, wchar_t *pucs, int codePage)
{
    int len;

    if (pucs == NULL || p == NULL)
        return -1;

    /* With a zero-sized output buffer the call only measures the result. */
    len = MultiByteToWideChar(codePage, 0, (const char *)p, -1, NULL, 0);
    if (len <= 0)
        return 0;

    /* Propagate the second call's result so a failed conversion is not
     * reported as success. */
    return MultiByteToWideChar(codePage, 0, (const char *)p, -1, pucs, len);
}
/*
 * Convert a NUL-terminated wide-character string to a multibyte string
 * using the Win32 WideCharToMultiByte API.
 *
 * pucs     - source wide-character string.
 * p        - destination buffer supplied by the caller; its size is not
 *            passed in, so it must already be large enough for the result
 *            including the terminating NUL.
 * codePage - code page identifier handed to WideCharToMultiByte.
 *
 * Returns the number of bytes written EXCLUDING the terminating NUL, or
 * -1 if either pointer is NULL or the conversion failed.
 */
int ucsto(wchar_t *pucs, char *p, int codePage)
{
    int len;

    /* The destination parameter is p; the original check named an
     * undeclared identifier here. */
    if (pucs == NULL || p == NULL)
        return -1;

    /* With a zero-sized output buffer the call only measures the result. */
    len = WideCharToMultiByte(codePage, 0, pucs, -1, NULL, 0, NULL, NULL);
    if (len <= 0)
        return -1;

    len = WideCharToMultiByte(codePage, 0, pucs, -1, p, len, NULL, NULL);
    if (len <= 0)
        return -1;

    /* Do not count the terminating NUL. */
    return len - 1;
}
The code page identifiers that can be passed as codePage are listed in MSDN as follows (the "Code Page" column):
Bit | Code Page | Description
--- | --- | ---
ANSI | |
0 | 1252 | Latin 1
1 | 1250 | Latin 2: Eastern Europe
2 | 1251 | Cyrillic
3 | 1253 | Greek
4 | 1254 | Turkish
5 | 1255 | Hebrew
6 | 1256 | Arabic
7 | 1257 | Baltic
8 | 1258 | Vietnam
9-15 | | Reserved for ANSI
ANSI and OEM | |
16 | 874 | Thai
17 | 932 | Japan, Shift-JIS
18 | 936 | Chinese: Simplified chars - PRC and Singapore
19 | 949 | Korean Unified Hangeul Code (Hangeul TongHabHyung Code)
20 | 950 | Chinese: Traditional chars - Hong Kong SAR, PRC and Taiwan
21 | 1361 | Korean (Johab)
22-29 | | Reserved for alternate ANSI and OEM
30-31 | | Reserved by system.
OEM | |
32-46 | | Reserved for OEM
47 | 1258 | Vietnam
48 | 869 | IBM Greek
49 | 866 | MS-DOS Russian
50 | 865 | MS-DOS Nordic
51 | 864 | Arabic
52 | 863 | MS-DOS (Canadian French)
53 | 862 | Hebrew
54 | 861 | MS-DOS Icelandic
55 | 860 | MS-DOS (Portuguese)
56 | 857 | IBM Turkish
57 | 855 | IBM Cyrillic; primarily Russian
58 | 852 | Latin 2
59 | 775 | Baltic
60 | 737 | Greek; former 437 G
61 | 708 | Arabic; ASMO 708
62 | 850 | Western European/Latin 1
63 | 437 | US