所謂“utf-8”只是UCS Transformation Format,只是UNICODE的一種表現形式,不等同於UNICODE,一般漢字在UNICODE中為兩個(雙)位元組表示,而我們看到實際儲存的文檔卻是三個位元組表示一個漢字的,看看下表:
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
UTF-8是一種變長度的表達方式,一般UNICODE為雙位元組(指UCS2),但為了與以前的ASCII碼相容(ASCII為一個位元組),於是就想出了這種方法:在ASCII碼的範圍用一個位元組表示,超出ASCII碼的範圍就用多位元組表示,這就形成了我們上面看到的UTF-8的表示方法。這樣的好處是當UNICODE文檔中只有ASCII碼時,儲存的文檔每個字元都為一個位元組,所以就與普通的ASCII文檔無異,讀入的時候也是如此,所以能與以前的ASCII文檔相容。
至於大於ASCII碼的,就會由上面的第一位元組的前幾位表示該unicode字元的長度,比如110xxxxx前三位的二進位表示告訴我們這是個2位元組的UNICODE字元;1110xxxx是個三位元組的UNICODE字元,依此類推。而首位元組後面的位元組都是以10開頭(見上表),這是為了與ASCII碼開頭的0區分,告訴我們這是個多位元組UTF-8編碼的後續位元組。看上面的編碼,我們將上面的x部分重新連起來組成的數值就是實際的UNICODE碼值了(排除10組成的標誌位)。
下面是個我寫的從UTF-8轉換到UNICODE真實值的程式,
#include <stdio.h>
#include <stdlib.h>
typedef unsigned short uint16; /* one 16-bit output code unit */
typedef unsigned int uint32;   /* wide enough to hold any decoded code point */

/*
 * __cdecl is a Windows calling-convention keyword. On other toolchains it
 * does not exist, so compile it away there to keep the declaration portable.
 */
#if !defined(_WIN32) && !defined(__cdecl)
#define __cdecl
#endif

enum {
    UTF16_REPLACEMENT_CHAR = 0xFFFD,   /* emitted for undecodable input */
    UTF16_MAX_BMP          = 0xFFFF,   /* largest value that fits one unit */
    UNICODE_MAX_CODE_POINT = 0x10FFFF  /* largest value UTF-16 can encode */
};

/*
 * Classify a UTF-8 lead byte.
 *
 * Returns the total length in bytes (1..6) of the sequence introduced by
 * `lead` and stores the payload bits carried by the lead byte in *bits.
 * Returns 0 when `lead` cannot start a sequence (a stray continuation byte
 * 10xxxxxx, or 0xFE / 0xFF).
 */
static int utf8_sequence_length(unsigned char lead, uint32 *bits)
{
    if ((lead & 0x80) == 0x00) { *bits = lead;        return 1; }
    if ((lead & 0xE0) == 0xC0) { *bits = lead & 0x1F; return 2; }
    if ((lead & 0xF0) == 0xE0) { *bits = lead & 0x0F; return 3; }
    if ((lead & 0xF8) == 0xF0) { *bits = lead & 0x07; return 4; }
    if ((lead & 0xFC) == 0xF8) { *bits = lead & 0x03; return 5; }
    if ((lead & 0xFE) == 0xFC) { *bits = lead & 0x01; return 6; }
    return 0;
}

/*
 * Convert a UTF-8 byte string into 16-bit Unicode code units.
 *
 * utf      input bytes; need not be NUL-terminated.
 * utfLen   number of bytes to read from utf; must be >= 0.
 * unicode  receives a newly allocated array holding the result. The caller
 *          owns it and must release it with free(). A terminating 0 unit is
 *          appended after the converted data but is not counted.
 *
 * Returns the number of 16-bit units written, or -1 when utf or unicode is
 * NULL, utfLen is negative, or memory cannot be allocated. *unicode is left
 * untouched on failure.
 *
 * Decoding rules:
 *  - Code points up to U+FFFF produce one unit.
 *  - Code points U+10000..U+10FFFF produce a UTF-16 surrogate pair (two
 *    units) instead of being truncated to their low 16 bits.
 *  - A byte that cannot start a sequence, a sequence cut short by utfLen, a
 *    sequence whose continuation bytes are not 10xxxxxx, and any value above
 *    U+10FFFF each produce U+FFFD. For malformed input only the offending
 *    lead byte is consumed, so decoding resynchronises on the next byte and
 *    the loop always makes progress.
 *  - Overlong encodings are accepted and decoded to their value.
 */
extern int __cdecl utf2unicode(const char* utf,int utfLen,uint16 **unicode)
{
    /* Work on unsigned bytes so masks and comparisons never see sign extension. */
    const unsigned char *src = (const unsigned char *)utf;
    uint16 *des;
    uint16 *shrunk;
    int s = 0; /* read index into src, in bytes */
    int d = 0; /* write index into des, in 16-bit units */

    if (utf == NULL || unicode == NULL || utfLen < 0) {
        return -1;
    }

    /*
     * Every path below consumes at least as many bytes as it emits units
     * (a surrogate pair needs a 4-byte sequence), so utfLen units plus the
     * terminator is always enough. calloc zero-fills the buffer and guards
     * the size multiplication against overflow.
     */
    des = (uint16 *)calloc((size_t)utfLen + 1, sizeof *des);
    if (des == NULL) {
        return -1;
    }

    while (s < utfLen) {
        uint32 cp;
        int len = utf8_sequence_length(src[s], &cp);
        int i;
        /* Reject invalid lead bytes and sequences running past the input. */
        int ok = (len != 0 && len <= utfLen - s);

        for (i = 1; ok && i < len; i++) {
            if ((src[s + i] & 0xC0) != 0x80) {
                ok = 0; /* not a 10xxxxxx continuation byte */
            } else {
                cp = (cp << 6) | (uint32)(src[s + i] & 0x3F);
            }
        }

        if (!ok) {
            des[d++] = (uint16)UTF16_REPLACEMENT_CHAR;
            s += 1;
            continue;
        }

        if (cp <= UTF16_MAX_BMP) {
            des[d++] = (uint16)cp;
        } else if (cp <= UNICODE_MAX_CODE_POINT) {
            /* Split the 20 bits above 0x10000 into a high/low surrogate pair. */
            cp -= 0x10000;
            des[d++] = (uint16)(0xD800 | (cp >> 10));
            des[d++] = (uint16)(0xDC00 | (cp & 0x3FF));
        } else {
            des[d++] = (uint16)UTF16_REPLACEMENT_CHAR;
        }
        s += len;
    }

    des[d] = 0;

    /* Trim to the used size; if shrinking fails the larger buffer is still valid. */
    shrunk = (uint16 *)realloc(des, ((size_t)d + 1) * sizeof *des);
    if (shrunk != NULL) {
        des = shrunk;
    }

    *unicode = des;
    return d;
}