Tesseract-ocr的dll tesseract-3.02.02-win32-lib-include-dirs:
Google-Tesseract-OCR
解壓之後會得到 include 檔案夾和 lib 檔案夾;在工程中配置好標頭檔目錄和庫檔案目錄之後,即可建立工程進行測試。
#include <string>

#include "baseapi.h"
#include "strngs.h"
#pragma comment(lib,"libtesseract302.lib")
char * str = "test.jpg";tesseract::TessBaseAPI api; api.Init(NULL, "chi_sim", tesseract::OEM_DEFAULT); //初始化,設定語言套件,中文簡體:chi_sim;英文:eng;也可以自己訓練語言套件//api.SetVariable( "tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" ); STRING text_out; if (!api.ProcessPages(str, NULL, 0, &text_out)) { return 0; }
上面的 text_out.string() 輸出的結果是 UTF-8 編碼格式,因此需要轉碼。下面是一段把 UTF-8 轉成 GBK 的代碼(代碼中使用的是 CP_ACP,也就是系統的 ANSI 字碼頁,在簡體中文 Windows 上即為 GBK):
string UTF8ToGBK(const std::string& strUTF8){int len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0);unsigned short * wszGBK = new unsigned short[len + 1];memset(wszGBK, 0, len * 2 + 2);MultiByteToWideChar(CP_UTF8, 0,LPCSTR(strUTF8.c_str()), -1, LPWSTR(wszGBK), len);len = WideCharToMultiByte(CP_ACP, 0,LPCTSTR(wszGBK), -1, NULL, 0, NULL, NULL);char *szGBK = new char[len + 1];memset(szGBK, 0, len + 1);WideCharToMultiByte(CP_ACP,0, LPCTSTR(wszGBK), -1, szGBK, len, NULL, NULL);//strUTF8 = szGBK;std::string strTemp(szGBK);delete[]szGBK;delete[]wszGBK;return strTemp;}
由於是在 MFC 中應用,因此又寫了一段在 MFC 裡把 UTF-8 轉成 Unicode(寬字元)的代碼:
wchar_t * result_str;CString result;//utf-8轉換成unicodeint len = MultiByteToWideChar(CP_UTF8,0,text_out.string(),-1,NULL,0);result_str = new wchar_t[len + 1];memset(result_str,0,len + 1);MultiByteToWideChar(CP_UTF8,0,text_out.string(),-1,result_str,len);//識別結果result = result_str;