思路:
擷取字串中的 Unicode 轉義部分(形如 \uXXXX),然後將該部分轉換為 UTF-8 格式的字元,最後將字串裡面的所有 Unicode 轉義替換為對應的 UTF-8 字元即可。
廢話不多說,直接上代碼:
標頭檔:
/*
 * charsetEncode.h
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */
#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_

#include <iostream>
#include <algorithm>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>

// NOTE(review): a using-directive in a header leaks into every includer.
// It is kept because the accompanying sources use unqualified `string`/`cout`.
using namespace std;

/**
 * Converts textual Unicode escapes of the form "\uXXXX" (a backslash, the
 * letter 'u' and four hexadecimal digits) found inside a string into UTF-8.
 */
class CcharsetEncode
{
public:
    /**
     * Rewrites `source` in place, replacing "\uXXXX" escapes with the UTF-8
     * encoding of the code point they name.
     *
     * @param source text to convert; modified in place.
     * @return always 0.
     */
    int unicode_to_utf8(string &source);

    /**
     * Replaces the first occurrence of `strSrc` inside `strContent` with
     * `strDest`. The match is case-sensitive; later occurrences are left
     * untouched, and nothing happens when `strSrc` is not found.
     *
     * @param strContent string to modify in place.
     * @param strSrc     NUL-terminated text to look for.
     * @param strDest    NUL-terminated replacement text.
     */
    void ReplaceStr(string &strContent, const char *strSrc, const char *strDest);

private:
    /**
     * Encodes a single code point as UTF-8 into `pOutput`.
     *
     * @param unic    code point to encode.
     * @param pOutput destination buffer; must not be null.
     * @param outSize capacity of `pOutput`; must be at least 6.
     * @return number of bytes written (1-6), or 0 if `unic` is out of range.
     */
    int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);

    /**
     * Tests whether `src` is exactly one six-character escape such as
     * "\u5e3f".
     *
     * @return 1 if it is, 0 otherwise.
     */
    int isUnicode(const string &src);

    /**
     * Parses hexadecimal digits into a number, e.g. "1a3f" -> 0x1a3f.
     * Characters that are not hexadecimal digits are skipped.
     */
    unsigned int xstrtoshortint(const char *str);
};

#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */
源檔案:
/* * charsetEncode.cpp * * Created on: Jul 25, 2016 * Author: root */#include "charsetEncode.h"int CcharsetEncode::unicode_to_utf8(string &source){int sourcesize = source.size();string src;unsigned char pout[8];for(int index = 0; index < sourcesize - 6;){memset(pout, 0, 8);src = source.substr(index, 6);if(isUnicode(src) == 1){string hexsrc = source.substr(index + 2, 4);int num = enc_unicode_to_utf8_one(xstrtoshortint(hexsrc.c_str()), pout, 8);ReplaceStr(source, src.c_str(), (char *)pout);index += 3;sourcesize = source.size();}else{index++;}}return 0;}int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize){ assert(pOutput != NULL); assert(outSize >= 6); if ( unic <= 0x0000007F ) { // * U-00000000 - U-0000007F: 0xxxxxxx *pOutput = (unic & 0x7F); return 1; } else if ( unic >= 0x00000080 && unic <= 0x000007FF ) { // * U-00000080 - U-000007FF: 110xxxxx 10xxxxxx *(pOutput+1) = (unic & 0x3F) | 0x80; *pOutput = ((unic >> 6) & 0x1F) | 0xC0; return 2; } else if ( unic >= 0x00000800 && unic <= 0x0000FFFF ) { // * U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx *(pOutput+2) = (unic & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 6) & 0x3F) | 0x80; *pOutput = ((unic >> 12) & 0x0F) | 0xE0; return 3; } else if ( unic >= 0x00010000 && unic <= 0x001FFFFF ) { // * U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx *(pOutput+3) = (unic & 0x3F) | 0x80; *(pOutput+2) = ((unic >> 6) & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 12) & 0x3F) | 0x80; *pOutput = ((unic >> 18) & 0x07) | 0xF0; return 4; } else if ( unic >= 0x00200000 && unic <= 0x03FFFFFF ) { // * U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx *(pOutput+4) = (unic & 0x3F) | 0x80; *(pOutput+3) = ((unic >> 6) & 0x3F) | 0x80; *(pOutput+2) = ((unic >> 12) & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 18) & 0x3F) | 0x80; *pOutput = ((unic >> 24) & 0x03) | 0xF8; return 5; } else if ( unic >= 0x04000000 && unic <= 0x7FFFFFFF ) { // * U-04000000 - U-7FFFFFFF: 1111110x 
10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx *(pOutput+5) = (unic & 0x3F) | 0x80; *(pOutput+4) = ((unic >> 6) & 0x3F) | 0x80; *(pOutput+3) = ((unic >> 12) & 0x3F) | 0x80; *(pOutput+2) = ((unic >> 18) & 0x3F) | 0x80; *(pOutput+1) = ((unic >> 24) & 0x3F) | 0x80; *pOutput = ((unic >> 30) & 0x01) | 0xFC; return 6; } return 0;}int CcharsetEncode::isUnicode(const string &src){if(src.size() != 6)return 0;if(src.find("\\u", 0) == 0){for(int i = 2; i <= 5; i++){if(!((src[i] >= 'a' && src[i] <= 'f')|| (src[i] >= 'A' && src[i] <= 'F')|| (src[i] >= '0' && src[i] <= '9'))){return 0;}}return 1;}else{return 0;}}unsigned int CcharsetEncode::xstrtoshortint(const char *str){ int len = strlen(str); unsigned int ivalue = 0; for (int i = 0; i < len; i++) { if ((str[i] <= '9' && str[i] >= '0')) { ivalue = ivalue * 16 + (str[i] - '0'); //16進位 可換其它進位 } else if ((str[i] >= 'a' && str[i] <= 'f')) { ivalue = ivalue * 16 + (str[i] - 'a') + 10; } else if ((str[i] >= 'A' && str[i] <= 'F')) { ivalue = ivalue * 16 + (str[i] - 'A') + 10; } } return ivalue;}void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest){ string strCopy(strContent); string strSrcCopy(strSrc); string::size_type pos = 0; string::size_type srclen = strlen(strSrc); if( (pos=strCopy.find(strSrcCopy, pos)) != string::npos) { strContent.replace(pos, srclen, strDest); }}
主函數測試:
// Demo driver: converts a string of "\uXXXX" escapes and prints the result.
// Assumes charsetEncode.h (CcharsetEncode, <iostream>, <string>) is included.
int main()
{
    CcharsetEncode encode;

    // The backslashes are doubled so the literal holds the six-character
    // escape text itself. Written as "\u300a", the compiler would decode the
    // universal-character-name at compile time and unicode_to_utf8 would have
    // nothing left to convert.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";

    encode.unicode_to_utf8(src);
    cout << " unicode: " << src << endl;
    return 0;
}