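UTF-8: 3 bytes per Chinese character (1 byte per ASCII character)
Unicode (UCS-2, as stored in a Windows wchar_t): 2 bytes per character
GB2312: 2 bytes per Chinese character (1 byte per ASCII character)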
Example:
The character "你" ("you") in UTF-8: E4 BD A0 = 11100100 10111101 10100000
The same character in Unicode: 4F 60 = 01001111 01100000
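According to the UTF-8 encoding rules, the three bytes decompose as follows (x marks the fixed marker bits): xxxx0100 xx111101 xx100000
Concatenating the bits that are not marked x gives the Unicode code: 0100 111101 100000 = 01001111 01100000 = 0x4F60.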
Note that the three leading 1 bits of the first UTF-8 byte (1110xxxx) indicate that the whole UTF-8 sequence for this character is 3 bytes long.
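After UTF-8 encoding, no byte of a multi-byte character can be mistaken for an ASCII character (such as a control or delimiter character), because the highest bit of every such byte is always 1.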
Class Definition
// Collection of static helpers for converting Chinese text between UTF-8,
// Unicode (a single wchar_t code unit) and GB2312.
// The single-character helpers convert exactly one character per call; the
// string helpers walk a length-delimited buffer and use the single-character
// helpers for every non-ASCII character.
class cchinesecode
{
public:
    // Decode the 3-byte UTF-8 sequence at ptext into one Unicode value in *pout.
    static void utf_8tounicode(wchar_t *pout, char *ptext);

    // Encode the Unicode value *ptext as a 3-byte UTF-8 sequence in pout[0..2].
    static void unicodetoutf_8(char *pout, wchar_t *ptext);

    // Convert one Unicode character to its multi-byte (GB2312) form in pout.
    static void unicodetogb2312(char *pout, wchar_t udata);

    // Convert the 2-byte multi-byte (GB2312) character at gbbuffer to Unicode in *pout.
    static void gb2312tounicode(wchar_t *pout, char *gbbuffer);

    // Convert Plen bytes of GB2312 text at ptext to UTF-8, stored in pout.
    static void gb2312toutf_8(std::string &pout, char *ptext, int Plen);

    // Convert Plen bytes of UTF-8 text at ptext to GB2312, stored in pout.
    static void utf_8togb2312(std::string &pout, char *ptext, int Plen);
};
Class implementation
Void cchinesecode: utf_8tounicode (wchar_t * pout, char * ptext)
{
Char * uchar = (char *) pout;
Uchar [1] = (ptext [0] & 0x0f) <4) + (ptext [1]> 2) & 0x0f );
Uchar [0] = (ptext [1] & 0x03) <6) + (ptext [2] & 0x3f );
Return;
}
Void cchinesecode: unicodetoutf_8 (char * pout, wchar_t * ptext)
{
// Pay attention to the order of wchar high and low characters. The lower byte is in the front and the higher byte is in the back
Char * pchar = (char *) ptext;
Pout [0] = (0xe0 | (pchar [1] & 0xf0)> 4 ));
Pout [1] = (0x80 | (pchar [1] & 0x0f) <2) + (pchar [0] & 0xc0)> 6 );
Pout [2] = (0x80 | (pchar [0] & 0x3f ));
Return;
}
Void cchinesecode: unicodetogb2312 (char * pout, wchar_t udata)
{
Widechartomultibyte (cp_acp, null, & udata, 1, pout, sizeof (wchar_t), null, null );
Return;
}
Void cchinesecode: gb2312tounicode (wchar_t * pout, char * gbbuffer)
{
: Multibytetowidechar (cp_acp, mb_precomposed, gbbuffer, 2, pout, 1 );
Return;
}
Void cchinesecode: gb2312toutf_8 (string & pout, char * ptext, int Plen)
{
Char Buf [4];
Int nlength = Plen * 3;
Char * rst = new char [nlength];
Memset (BUF, 0, 4 );
Memset (RST, 0, nlength );
Int I = 0;
Int J = 0;
While (I <Plen)
{
// If it is an English copy
If (* (ptext + I)> = 0)
{
RST [J ++] = ptext [I ++];
}
Else
{
Wchar_t pbuffer;
Gb2312tounicode (& pbuffer, ptext + I );
Unicodetoutf_8 (BUF, & pbuffer );
Unsigned short int TMP = 0;
TMP = rst [J] = Buf [0];
TMP = rst [J + 1] = Buf [1];
TMP = rst [J + 2] = Buf [2];
J + = 3;
I + = 2;
}
}
RST [J] = '';
// Return results
Pout = RST;
Delete [] RST;
Return;
}
Void cchinesecode: utf_8togb2312 (string & pout, char * ptext, int Plen)
{
Char * newbuf = new char [Plen];
Char ctemp [4];
Memset (ctemp, 0, 4 );
Int I = 0;
Int J = 0;
While (I <Plen)
{
If (ptext> 0)
{
Newbuf [J ++] = ptext [I ++];
}
Else
{
Wchar wtemp;
Utf_8tounicode (& wtemp, ptext + I );
Unicodetogb2312 (ctemp, wtemp );
Newbuf [J] = ctemp [0];
Newbuf [J + 1] = ctemp [1];
I + = 3;
J + = 2;
}
}
Newbuf [J] = '';
Pout = newbuf;
Delete [] newbuf;
Return;
}