在做 CLucene 與 Lucene 所產生的 Index 檔案相互相容時,遇到了編碼轉換問題。對於非英文的編碼,兩者的相容性可能都會存在這樣的問題。經過跟蹤 CLucene 程式,發現它是用 Unicode 編碼方式儲存的,因此要先把字串或檔案轉換成 Unicode 編碼,然後再進行其它處理。
轉換的具體程式碼如下(已在 Linux 與 VC6.0 上測試通過):
#ifndef _UNIX
static inline int codepage(const char* code_page)
{
return 936;//"GBK"
}
#endifstatic inline int mb2wc(const char* code_page,/*in*/const char* in,int in_len,
/*out*/wchar_t* out,int out_max)
{
#ifdef _UNIX
size_t result;
iconv_t env;
env = iconv_open("WCHAR_T",code_page);
result = iconv(env,(char**)&in,(size_t*)&in_len,(char**)&out,(size_t*)&out_max);
iconv_close(env);
return (int) result;
#else
return ::MultiByteToWideChar(codepage(code_page),0,in,in_len,out,out_max);
#endif
}static inline int wc2mb(const char* code_page,/*in*/const wchar_t* in,int in_len,
/*out*/char* out,int out_max)
{
#ifdef _UNIX
size_t result;
iconv_t env;
env = iconv_open(code_page,"WCHAR_T");
result = iconv(env,(char**)&in,(size_t*)&in_len,(char**)&out,(size_t*)&out_max);
iconv_close(env);
return (int) result;
#else
return ::WideCharToMultiByte(codepage(code_page),0,in,-1,out,out_max, NULL, NULL);
#endif
}void str_to_UnicodeChar(const char* strIn,TCHAR* &strOut){
if(!strIn)
return; int i= mb2wc("936",(char*)strIn, -1, NULL, 0);
strOut = (TCHAR*)malloc(sizeof(TCHAR)*i);
mb2wc("936",(char*)strIn, -1, strOut, i);
}
/**
 * Converts a NUL-terminated wide string to a newly allocated code page 936
 * multibyte string.
 *
 * @param strIn  input string; when NULL the function returns without
 *               touching strOut.
 * @param strOut receives a new[]-allocated, NUL-terminated buffer that the
 *               caller must release with delete[]. If the conversion yields
 *               nothing, the result is an empty string.
 */
void UnicodeChar_to_str(const TCHAR* strIn,char* &strOut){
    if(!strIn)
        return;

    // First pass only measures how many bytes the result needs.
    int needed = wc2mb("936", strIn, -1, NULL, 0);
    if (needed < 0)
        needed = 0;   // a negative length must never be used as a size or index

    strOut = new char[needed + 1];
    strOut[0] = 0;    // stays an empty string if nothing gets converted
    if (needed > 0)
        wc2mb("936", strIn, -1, strOut, needed);
    strOut[needed] = 0;
}

/**
 * Copies a NUL-terminated TCHAR string into a newly allocated char string,
 * narrowing each TCHAR to a single char. No code-page conversion is done.
 *
 * @param strIn  input string; when NULL the function returns without
 *               touching strOut.
 * @param strOut receives a new[]-allocated, NUL-terminated buffer that the
 *               caller must release with delete[]. The buffer is at least
 *               1024 bytes and grows to fit longer input.
 */
void tchar_to_str(const TCHAR* strIn ,char* &strOut){
    if(!strIn)
        return ;

    size_t length = 0;
    while (strIn[length])
        ++length;

    // Keep the historical 1024-byte minimum, but never allocate less than the
    // input needs: a fixed 1024-byte buffer overflows on longer strings.
    const size_t kMinCapacity = 1024;
    const size_t capacity = (length + 1 > kMinCapacity) ? length + 1 : kMinCapacity;

    strOut = new char[capacity];
    for (size_t i = 0; i < length; ++i)
        strOut[i] = static_cast<char>(strIn[i]);
    strOut[length] = '\0';
}