1. Description
If you do systems programming on Windows, you will inevitably have to process Chinese strings. Most of the time, Chinese text is stored in a multi-byte (ANSI code page) encoding. For better compatibility, or to meet special requirements (such as displaying the text on a web page), it is often necessary to convert it to Unicode (UTF-16) or UTF-8.
2. Sample code
2.1 Convert a Chinese string to Unicode
/************************************************************************
* int CN2Unicode(char *input, wchar_t *output)
* Function: convert a string in the system ANSI code page (CP_ACP),
*           e.g. Chinese text, to a wide-character (Unicode) string.
* Parameter: input  - NUL-terminated ANSI string.
*            output - destination buffer; it must have room for at least
*                     MAX_PATH wchar_t elements, because that is the
*                     capacity handed to MultiByteToWideChar.
* Return:    1 on success; 0 if an argument is NULL or the conversion
*            fails (for example, the result does not fit in MAX_PATH
*            elements). When output is non-NULL it is set to the empty
*            string on failure.
************************************************************************/
int CN2Unicode(char *input, wchar_t *output)
{
    if (output == NULL) {
        return 0;
    }
    if (input == NULL) {
        output[0] = L'\0';
        return 0;
    }

    /* A source length of -1 converts up to and including the NUL terminator. */
    if (MultiByteToWideChar(CP_ACP, 0, input, -1, output, MAX_PATH) == 0) {
        /* Do not hand a partially written, unterminated buffer to the caller. */
        output[0] = L'\0';
        return 0;
    }
    return 1;
}
2.2 Convert a Chinese string to UTF-8
/************************************************************************
* int CN2Utf8(char *input, char *output)
* Function: convert a string in the system ANSI code page (CP_ACP),
*           e.g. Chinese text, to a NUL-terminated UTF-8 string.
* Parameter: input  - NUL-terminated ANSI string.
*            output - destination buffer for the UTF-8 result. The
*                     interface carries no capacity, so the caller must
*                     supply a buffer large enough for the whole result
*                     including the terminator; 3 * strlen(input) + 1
*                     bytes should be sufficient (NOTE(review): confirm
*                     for the code pages you target).
* Return:    1 on success; 0 if an argument is NULL, memory cannot be
*            allocated, or a conversion step fails. When output is
*            non-NULL it holds the empty string on failure.
************************************************************************/
int CN2Utf8(char *input, char *output)
{
    wchar_t *wide;
    int wide_len;
    int utf8_len;
    int ok = 0;

    if (output == NULL) {
        return 0;
    }
    output[0] = '\0';
    if (input == NULL) {
        return 0;
    }

    /* First pass: ask how many wide characters (terminator included) are needed. */
    wide_len = MultiByteToWideChar(CP_ACP, 0, input, -1, NULL, 0);
    if (wide_len <= 0) {
        return 0;
    }

    /* Cast kept so the snippet also builds as C++. */
    wide = (wchar_t *)calloc((size_t)wide_len, sizeof *wide);
    if (wide == NULL) {
        return 0;
    }

    if (MultiByteToWideChar(CP_ACP, 0, input, -1, wide, wide_len) > 0) {
        /* Size the UTF-8 result in bytes; it is generally larger than wide_len. */
        utf8_len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
        if (utf8_len > 0 &&
            WideCharToMultiByte(CP_UTF8, 0, wide, -1, output, utf8_len, NULL, NULL) > 0) {
            ok = 1;
        }
    }

    /* The intermediate wide buffer is private to this function. */
    free(wide);
    return ok;
}
C/C++: Unicode to UTF-8, ANSI to Unicode, and ANSI to UTF-8
Sometimes you need to convert the contents of an ANSI-encoded file to UTF-8: after reading each line, convert the ANSI string to UTF-8 and then write it to the output file.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <windows.h>
/*
 * Unicode2Utf8 - convert a wide-character string to a newly allocated
 * UTF-8 string.
 *
 * unicode: NUL-terminated wchar_t string passed through a const char *
 *          (the pointer is cast back to const wchar_t * before use).
 *
 * Returns a heap-allocated, NUL-terminated UTF-8 string that the caller
 * must free(). If the conversion fails the result is an empty string;
 * if memory cannot be allocated the result is NULL.
 */
char *Unicode2Utf8(const char *unicode)
{
    const wchar_t *wide = (const wchar_t *)unicode;
    char *szUtf8;
    int len;

    /* First pass: required size in bytes, terminator included (0 on failure). */
    len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
    if (len < 0) {
        len = 0;
    }

    /* calloc zero-fills, so the buffer is always NUL-terminated.
     * Cast kept so the snippet also builds as C++. */
    szUtf8 = (char *)calloc((size_t)len + 1, 1);
    if (szUtf8 == NULL) {
        return NULL;
    }

    if (len > 0) {
        WideCharToMultiByte(CP_UTF8, 0, wide, -1, szUtf8, len, NULL, NULL);
    }
    return szUtf8;
}
/*
 * Ansi2Unicode - convert a string in the system ANSI code page (CP_ACP)
 * to a newly allocated wide-character string.
 *
 * str: NUL-terminated ANSI string.
 *
 * Returns a heap-allocated, NUL-terminated wchar_t string exposed as
 * char * (cast it back to wchar_t * to read it); the caller must free()
 * it. Returns NULL if str is NULL, the conversion fails, or memory cannot
 * be allocated. A copy of the single-byte input is never returned, since
 * callers interpret the result as wide-character data.
 */
char *Ansi2Unicode(const char *str)
{
    wchar_t *pwText;
    int dwUnicodeLen;

    if (str == NULL) {
        return NULL;
    }

    /* First pass: number of wide characters needed, terminator included. */
    dwUnicodeLen = MultiByteToWideChar(CP_ACP, 0, str, -1, NULL, 0);
    if (dwUnicodeLen <= 0) {
        return NULL;
    }

    /* Cast kept so the snippet also builds as C++. */
    pwText = (wchar_t *)calloc((size_t)dwUnicodeLen, sizeof *pwText);
    if (pwText == NULL) {
        return NULL;
    }

    if (MultiByteToWideChar(CP_ACP, 0, str, -1, pwText, dwUnicodeLen) == 0) {
        free(pwText);
        return NULL;
    }
    return (char *)pwText;
}
/*
 * ConvertAnsiToUtf8 - convert an ANSI string to UTF-8 by going through
 * an intermediate wide-character buffer (Ansi2Unicode, then Unicode2Utf8).
 *
 * str: NUL-terminated ANSI string.
 *
 * Returns whatever Unicode2Utf8 produces for the intermediate buffer; the
 * caller must free() it. If Ansi2Unicode yields NULL, NULL is returned
 * and no further conversion is attempted.
 */
char *ConvertAnsiToUtf8(const char *str)
{
    char *unicode = Ansi2Unicode(str);
    char *utf8;

    if (unicode == NULL) {
        return NULL;
    }

    utf8 = Unicode2Utf8(unicode);
    /* The wide-character buffer is only an intermediate; release it here. */
    free(unicode);
    return utf8;
}
/*
 * Self-test: converts a fixed ANSI byte sequence to UTF-8 and checks the
 * result against the expected UTF-8 bytes.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    printf("Hello, world\n");

    // 1. Construct ANSI text consisting of the two Chinese characters
    //    U+4E2D U+6587 followed by "abc", and check its hex encoding.
    // Ansi: D6 D0 CE C4 61 62 63
    // Utf8: E4 B8 AD E6 96 87 61 62 63
    // NOTE(review): the ANSI bytes assume the system ANSI code page is
    // 936 (GBK); under another code page the comparison below will fail.
    // The explicit casts avoid implicit narrowing of values above 0x7F.
    char ansi[] = {(char)0xD6, (char)0xD0, (char)0xCE, (char)0xC4,
                   0x61, 0x62, 0x63, 0};
    char utf8[] = {(char)0xE4, (char)0xB8, (char)0xAD,
                   (char)0xE6, (char)0x96, (char)0x87,
                   0x61, 0x62, 0x63, 0};

    char *str = ConvertAnsiToUtf8(ansi);
    assert(str != NULL);
    assert(strcmp(str, utf8) == 0);
    free(str);
    return 0;
}