主要思想:
1. 要有一個語料庫
2. 統計每個詞出現的頻率, 一會來做樸素貝葉斯候選
3. 舉例: 中國人民共和國的
其中語料庫中有中國, 人民, 中國人, 共和國等等的片語.
現在輸入: 中國人都愛中華人民共和國;
分詞的時候取max( 各種分法得到的score );
例如: solution1:中國人_都愛中華人民_共和國
solution2:中國_人_都愛中華人民_共和國
solution3:中國_人_都愛_中華_人民_共和國
bestSegSolution = max( solutions( segSolution[i] ) );
4.對於一句漢字的分詞可以看做
seg( StringIn ) = firPart + seg(StringIn – firPart); // 我用score來衡量當前分詞結果的好壞
5. 樸素貝葉斯的意思就是: 分詞後的兩個詞之間是相互獨立的, 也就是後者的出現與前者無關
6. 這個只是初級版, 很簡單, 需要再加點東西, 結果會更加完美.. 當然, 按照做事情的原則, 都是從簡單開始做的, 再努力
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using System.Windows.Forms;

namespace ChineseWordSeg
{
    /// <summary>
    /// Dictionary-based Chinese word segmenter. Word frequencies are counted
    /// from a whitespace-separated training corpus; a run of Chinese characters
    /// is then split into the sequence of dictionary words whose summed
    /// frequencies is the largest (words are scored independently of each
    /// other, which is the "naive" assumption).
    /// </summary>
    class NaiveBayes
    {
        // Default training corpus (the PKU corpus). Only its file name is used,
        // to derive the name of the on-disk frequency cache.
        private string wordLibPath = "../WordLib/pku_training.txt";

        // True once a training attempt has produced a non-empty dictionary.
        bool trained = false;

        // Frequency of every multi-character word seen in the corpus.
        private Dictionary<string, long> wordLib = new Dictionary<string, long>();

        // Frequency of every single-character word seen in the corpus.
        private Dictionary<string, long> singleWordLib = new Dictionary<string, long>();

        // Upper bound on the length of any key stored in wordLib.
        int maxLen = 0;

        // Best score found so far for the run currently being segmented.
        long maxScore = 0;

        // Word boundaries of the best segmentation of the current run, encoded
        // as "<start>s<end>s" pairs (absolute character offsets).
        private string segPos = "";

        // Concatenated boundaries of every run of the text being segmented.
        private string segSentence = "";

        /// <summary>
        /// Returns true when <paramref name="chr"/> lies in U+4E00..U+9FFF.
        /// </summary>
        bool isChineseWord(char chr)
        {
            return chr >= 0x4E00 && chr <= 0x9FFF;
        }

        /// <summary>
        /// Builds the word-frequency dictionary. If a non-empty cache file
        /// exists next to <paramref name="path"/> it is loaded; otherwise the
        /// corpus at <paramref name="path"/> is read as gb2312, every maximal
        /// run of Chinese characters is counted as one word, and the
        /// multi-character counts are written to the cache file.
        /// </summary>
        /// <param name="path">Path of the training corpus file.</param>
        public void trainDate(string path)
        {
            // Start from a clean state so that retraining never double counts.
            wordLib.Clear();
            singleWordLib.Clear();
            maxLen = 0;

            string savePath = getCachePath(path);
            if (tryLoadCache(savePath))
            {
                return;
            }

            if (!File.Exists(path))
            {
                // "The corpus path is wrong, please check it."
                MessageBox.Show("语料库路径有错，请检查");
                return;
            }

            Stopwatch tm = Stopwatch.StartNew();
            countCorpusWords(path);
            saveCache(savePath);
            tm.Stop();

            // ElapsedMilliseconds is the total duration; Elapsed.Milliseconds
            // would only be the 0-999 ms component of it.
            MessageBox.Show(tm.ElapsedMilliseconds.ToString(CultureInfo.InvariantCulture), "training done");
        }

        // Returns the cache file path: the directory containing the corpus plus
        // "<file name of wordLibPath>_trained".
        private string getCachePath(string path)
        {
            DirectoryInfo corpusEntry = new DirectoryInfo(path);
            DirectoryInfo parent = corpusEntry.Parent;
            // Parent is null when the path is a root; fall back to the path itself.
            string directory = parent != null ? parent.FullName : corpusEntry.FullName;
            // NOTE(review): the cache is named after wordLibPath, not after the
            // corpus actually passed in - confirm this is intended.
            return Path.Combine(directory, new FileInfo(wordLibPath).Name + "_trained");
        }

        // Loads "word count" lines from the cache into wordLib. Returns false
        // when the cache file is missing or empty; malformed lines are skipped.
        private bool tryLoadCache(string savePath)
        {
            FileInfo cacheInfo = new FileInfo(savePath);
            if (!cacheInfo.Exists || cacheInfo.Length <= 0)
            {
                return false;
            }

            using (StreamReader reader = new StreamReader(savePath))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] keyValue = line.Split(' ');
                    long count;
                    if (keyValue.Length < 2
                        || keyValue[0].Length == 0
                        || !long.TryParse(keyValue[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out count))
                    {
                        continue;
                    }

                    wordLib[keyValue[0]] = count;
                    if (maxLen < keyValue[0].Length)
                    {
                        maxLen = keyValue[0].Length;
                    }
                }
            }
            return true;
        }

        // Reads the gb2312 corpus character by character and counts every
        // maximal run of Chinese characters as one word.
        private void countCorpusWords(string path)
        {
            using (StreamReader reader = new StreamReader(path, Encoding.GetEncoding("gb2312")))
            {
                StringBuilder word = new StringBuilder();
                int next;
                while ((next = reader.Read()) != -1)
                {
                    char current = (char)next;
                    if (isChineseWord(current))
                    {
                        word.Append(current);
                        if (maxLen < word.Length)
                        {
                            maxLen = word.Length;
                        }
                    }
                    else
                    {
                        recordWord(word);
                    }
                }

                // The corpus may end in the middle of a word; count that word too.
                recordWord(word);
            }
        }

        // Counts the pending word (if any) and empties the buffer. Words longer
        // than one character go to wordLib, single characters to singleWordLib.
        private void recordWord(StringBuilder word)
        {
            if (word.Length == 0)
            {
                return;
            }

            string text = word.ToString();
            word.Length = 0;
            incrementCount(text.Length > 1 ? wordLib : singleWordLib, text);
        }

        // Adds one to counts[key], starting from zero for an unseen key.
        private static void incrementCount(Dictionary<string, long> counts, string key)
        {
            long current;
            counts.TryGetValue(key, out current);
            counts[key] = current + 1;
        }

        // Writes wordLib to the cache file, one "word count" pair per line.
        private void saveCache(string savePath)
        {
            using (StreamWriter writer = new StreamWriter(savePath))
            {
                foreach (KeyValuePair<string, long> entry in wordLib)
                {
                    writer.WriteLine(entry.Key + " " + entry.Value.ToString(CultureInfo.InvariantCulture));
                }
            }
        }

        /// <summary>
        /// Renders <paramref name="strIn"/> with every word recorded in
        /// segSentence wrapped in parentheses.
        /// </summary>
        /// <param name="strIn">The text that was segmented.</param>
        /// <returns>The annotated text; an empty string for null input.</returns>
        public string getSegedString(string strIn)
        {
            if (strIn == null)
            {
                return "";
            }

            // Boundaries are stored as "<start>s<end>s..." in ascending order.
            List<int> boundaries = new List<int>();
            foreach (string token in segSentence.Split('s'))
            {
                if (token.Length > 0)
                {
                    boundaries.Add(int.Parse(token, CultureInfo.InvariantCulture));
                }
            }

            StringBuilder output = new StringBuilder(strIn.Length + boundaries.Count);
            bool insideWord = false;
            int next = 0;
            for (int i = 0; i < strIn.Length; i++)
            {
                // Adjacent words share an offset (end of one == start of the
                // next), so several boundaries can fall on the same index.
                while (next < boundaries.Count && boundaries[next] == i)
                {
                    insideWord = !insideWord;
                    output.Append(insideWord ? '(' : ')');
                    next++;
                }
                output.Append(strIn[i]);
            }

            // A word that ends exactly at the end of the text is still open.
            if (next < boundaries.Count)
            {
                output.Append(')');
            }
            return output.ToString();
        }

        /// <summary>
        /// Segments <paramref name="strIn"/>: trains on first use, splits the
        /// text into runs of Chinese characters, segments each run and returns
        /// the text with every recognised word wrapped in parentheses.
        /// </summary>
        /// <param name="strIn">Text to segment.</param>
        /// <param name="trainDataPath">Training corpus path, used while the dictionary is still empty.</param>
        /// <returns>The annotated text; an empty string for null input.</returns>
        public string doNaiveBayesSegmentation(string strIn, string trainDataPath)
        {
            if (!trained)
            {
                trainDate(trainDataPath);
                // Only mark as trained when something was loaded, so a wrong
                // path can be corrected on a later call.
                trained = wordLib.Count > 0;
            }

            if (strIn == null)
            {
                return "";
            }

            // Boundaries left over from a previous call must not leak into this one.
            segSentence = "";
            StringBuilder allBoundaries = new StringBuilder();

            int i = 0;
            int len = strIn.Length;
            while (i < len)
            {
                int runStart = i;
                while (i < len && isChineseWord(strIn[i]))
                {
                    i++;
                }

                if (i > runStart)
                {
                    maxScore = 0;
                    segPos = "";
                    naviveBayesSeg(strIn.Substring(runStart, i - runStart), 0, "", runStart);
                    allBoundaries.Append(segPos);
                }

                // Punctuation and other non-Chinese characters separate runs.
                while (i < len && !isChineseWord(strIn[i]))
                {
                    i++;
                }
            }

            segSentence = allBoundaries.ToString();
            return getSegedString(strIn);
        }

        /// <summary>
        /// Recursive search: seg(text) = firstPart + seg(text - firstPart).
        /// Every dictionary word that prefixes <paramref name="strIn"/> is tried
        /// as the first part, and so is skipping one character; the candidate
        /// with the highest accumulated score ends up in segPos / maxScore.
        /// The search is exhaustive and can be exponential in the run length.
        /// </summary>
        /// <param name="strIn">Remaining, not yet segmented text.</param>
        /// <param name="score">Sum of the frequencies of the words chosen so far.</param>
        /// <param name="seg">Boundaries chosen so far, as "<start>s<end>s" pairs.</param>
        /// <param name="tPos">Absolute offset of <paramref name="strIn"/> in the original text.</param>
        public void naviveBayesSeg(string strIn, long score, string seg, int tPos)
        {
            // Strictly greater: on a tie the candidate found first is kept.
            if (score > maxScore)
            {
                segPos = seg;
                maxScore = score;
            }

            if (string.IsNullOrEmpty(strIn))
            {
                return;
            }

            // No dictionary key is longer than maxLen, so longer prefixes
            // cannot match and need not be tried.
            int longest = Math.Min(strIn.Length, maxLen);
            for (int length = 1; length <= longest; length++)
            {
                long count;
                if (wordLib.TryGetValue(strIn.Substring(0, length), out count))
                {
                    int end = tPos + length;
                    string extended = seg
                        + tPos.ToString(CultureInfo.InvariantCulture) + "s"
                        + end.ToString(CultureInfo.InvariantCulture) + "s";
                    naviveBayesSeg(strIn.Substring(length), score + count, extended, end);
                }
            }

            // Also try leaving the first character outside of any word.
            naviveBayesSeg(strIn.Substring(1), score, seg, tPos + 1);
        }
    }
}