C# 中文分詞[基於統計的樸素貝葉斯演算法]

來源:互聯網
上載者:User
 
主要思想:
1. 要有一個語料庫
2. 統計每個詞出現的頻率, 一會來做樸素貝葉斯候選
3. 舉例: 中國人民共和國的
    其中語料庫中有中國, 人民, 中國人, 共和國等等的片語. 
現在輸入: 中國人都愛中華人民共和國;
分詞的時候取max( 各種分發得到的score ); 
例如: solution1:中國人_都愛中華人民_共和國
solution2:中國_人_都愛中華人民_共和國
solution3:中國_人_都愛_中華_人民_共和國 

              bestSegSolution = max( solutions(segSlution[i] ));

      4.對於一句漢字的分詞可以看做

               seg( StringIn ) =  firPart + seg(StringIn – firPart); //   我用score來衡量當前分詞結果的好壞

      6。 樸素貝葉斯的意思就是: 分詞後的, 兩個詞之間是相互獨立的, 也就是後者的出現與前者無關

5. 這個只是初級版, 很簡單, 需要再加點東西, 結果會更加的完美.. 當然, 按照做事情的原則, 都是從簡單開始做的, 再努力
 
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Windows.Forms;
using System.IO;
using System.Diagnostics;

namespace ChineseWordSeg
{
    /// <summary>
    /// Chinese word segmenter driven by word counts collected from a training corpus.
    /// </summary>
    class NaiveBayes
    {
        // Default training corpus (the PKU corpus).
        private string wordLibPath = "../WordLib/pku_training.txt";

        // Set by doNaiveBayesSegmentation once training has been triggered.
        bool trained = false;

        // Occurrence count of every multi-character word seen in the corpus.
        private Dictionary<string, long> wordLib = new Dictionary<string, long>();

        // Occurrence count of every single-character word seen in the corpus.
        private Dictionary<string, long> singleWordLib = new Dictionary<string, long>();

        // Length of the longest run of Chinese characters seen while training.
        int maxLen = 0;

        // Best score found so far for the run currently being segmented.
        long maxScore = 0;

        // Split points of the current run, encoded as "<start>s<end>s" pairs.
        // Runs are delimited by punctuation and other non-Chinese characters.
        private string segPos = "";

        // Split points accumulated for the whole paragraph.
        // NOTE: this declaration must stay on its own line; when it shared a line
        // with the comment above, the "//" swallowed it and the field disappeared.
        private string segSentence = "";

        // Is the character a Chinese character?
        /// <summary>
        /// Returns true when <paramref name="chr"/> lies in U+4E00..U+9FFF.
        /// </summary>
        bool isChineseWord(char chr)
        {
            return chr >= 0x4E00 && chr <= 0x9FFF;
        }

        /// <summary>
        /// Counts how often each word occurs in the corpus at <paramref name="path"/>.
        /// A word is a maximal run of Chinese characters; multi-character words go to
        /// wordLib and single-character words to singleWordLib. The multi-character
        /// counts are cached next to the corpus in "&lt;corpus file name&gt;_trained"
        /// and reloaded from there on later runs.
        /// </summary>
        /// <param name="path">Path of the GB2312-encoded training corpus.</param>
        public void trainDate(string path)
        {
            // Original design notes:
            // 1. Count the frequency of every word; naive Bayes disambiguation then
            //    picks, among the possible groupings, the one with the larger score.
            // 2. Counting per-character frequencies was considered but never done.
            wordLib.Clear();
            singleWordLib.Clear();
            maxLen = 0;

            // The cache is named after the corpus actually being trained on, so a
            // different corpus in the same directory cannot pick up a stale cache.
            string fullPath = Path.GetFullPath(path);
            string savePath = Path.Combine(
                Path.GetDirectoryName(fullPath) ?? "",
                Path.GetFileName(fullPath) + "_trained");

            if (File.Exists(savePath) && new FileInfo(savePath).Length > 0)
            {
                using (StreamReader cacheReader = new StreamReader(savePath))
                {
                    string line;
                    while ((line = cacheReader.ReadLine()) != null)
                    {
                        string[] keyValue = line.Split(' ');
                        long cachedCount;
                        // Skip blank or damaged lines instead of crashing on them.
                        if (keyValue.Length < 2 ||
                            !long.TryParse(keyValue[1],
                                           System.Globalization.NumberStyles.Integer,
                                           System.Globalization.CultureInfo.InvariantCulture,
                                           out cachedCount))
                        {
                            continue;
                        }

                        wordLib[keyValue[0]] = cachedCount;
                        if (maxLen < keyValue[0].Length)
                        {
                            maxLen = keyValue[0].Length;
                        }
                    }
                }

                // The cache stores only wordLib, so singleWordLib stays empty here.
                return;
            }

            if (!File.Exists(path))
            {
                // "The corpus path is wrong, please check it."
                MessageBox.Show("语料库路径有错，请检查");
                return;
            }

            Stopwatch tm = Stopwatch.StartNew();

            using (StreamReader sr = new StreamReader(path, System.Text.Encoding.GetEncoding("gb2312")))
            {
                StringBuilder word = new StringBuilder();
                int next;
                while ((next = sr.Read()) != -1)
                {
                    char current = (char)next;
                    if (isChineseWord(current))
                    {
                        word.Append(current);
                        // Track the longest run of Chinese characters.
                        if (maxLen < word.Length)
                        {
                            maxLen = word.Length;
                        }
                    }
                    else
                    {
                        // Any non-Chinese character ends the current word.
                        countWord(word);
                    }
                }

                // Count the final word when the corpus ends on a Chinese character.
                countWord(word);
            }

            using (StreamWriter sw = new StreamWriter(savePath))
            {
                foreach (KeyValuePair<string, long> entry in wordLib)
                {
                    sw.WriteLine(entry.Key + " " +
                        entry.Value.ToString(System.Globalization.CultureInfo.InvariantCulture));
                }
            }

            tm.Stop();

            // ElapsedMilliseconds is the total duration; Elapsed.Milliseconds would
            // only be the 0-999 millisecond component of it.
            MessageBox.Show(tm.ElapsedMilliseconds.ToString(), "training done");
        }

        /// <summary>
        /// Adds one occurrence of the buffered word to the matching dictionary and
        /// empties the buffer. Does nothing when the buffer is empty.
        /// </summary>
        private void countWord(StringBuilder word)
        {
            if (word.Length == 0)
            {
                return;
            }

            string key = word.ToString();
            word.Length = 0;

            Dictionary<string, long> target = key.Length > 1 ? wordLib : singleWordLib;
            long seen;
            target.TryGetValue(key, out seen); // seen stays 0 for a new word
            target[key] = seen + 1;
        }

// Returns the segmentation result as a marked-up copy of the input.
        /// <summary>
        /// Wraps every segmented word of <paramref name="strIn"/> in parentheses,
        /// using the boundary positions stored in segSentence.
        /// </summary>
        /// <param name="strIn">The text the stored boundary positions refer to.</param>
        /// <returns>The text with "(" and ")" inserted around each word.</returns>
        public string getSegedString(string strIn)
        {
            if (string.IsNullOrEmpty(strIn))
            {
                return "";
            }

            // segSentence is a list of character indices separated by 's'.
            // Count how many boundaries fall on each index so the scan below needs
            // one lookup per character instead of a list search and removal.
            Dictionary<int, int> marksAt = new Dictionary<int, int>();
            int totalMarks = 0;
            foreach (string token in segSentence.Split('s'))
            {
                if (token.Length == 0)
                {
                    continue;
                }

                // int, not Int16: positions past 32767 must not overflow.
                int position = int.Parse(token);
                int seen;
                marksAt.TryGetValue(position, out seen);
                marksAt[position] = seen + 1;
                totalMarks++;
            }

            StringBuilder strOut = new StringBuilder(strIn.Length + totalMarks);
            bool open = false; // true while a "(" is waiting for its ")"

            for (int i = 0; i < strIn.Length; i++)
            {
                int marks;
                if (marksAt.TryGetValue(i, out marks))
                {
                    // Boundaries alternate: the first opens a word, the next closes it.
                    for (int m = 0; m < marks; m++)
                    {
                        open = !open;
                        strOut.Append(open ? '(' : ')');
                    }
                }

                strOut.Append(strIn[i]);
            }

            // A word ending at the very end of the text has its closing boundary
            // at index strIn.Length, which the loop above never visits.
            if (open)
            {
                strOut.Append(')');
            }

            return strOut.ToString();
        }

// Entry point: performs the naive Bayes word segmentation.
        /// <summary>
        /// Segments <paramref name="strIn"/> and returns it with every recognised
        /// word wrapped in parentheses. Training is triggered on first use.
        /// </summary>
        /// <param name="strIn">Text to segment.</param>
        /// <param name="trainDataPath">Corpus path handed to trainDate.</param>
        /// <returns>The marked-up text produced by getSegedString.</returns>
        public string doNaiveBayesSegmentation(string strIn, string trainDataPath)
        {
            if (!trained)
            {
                trainDate(trainDataPath);
                // Only remember success, so a failed training is retried next time.
                trained = wordLib.Count > 0;
            }

            // Boundaries from an earlier call must not leak into this result.
            segSentence = "";

            if (string.IsNullOrEmpty(strIn))
            {
                return "";
            }

            StringBuilder boundaries = new StringBuilder();
            int len = strIn.Length;
            int i = 0;

            while (i < len)
            {
                // Collect one maximal run of Chinese characters...
                int runStart = i;
                while (i < len && isChineseWord(strIn[i]))
                {
                    i++;
                }

                if (i > runStart)
                {
                    // ...and segment it on its own, starting from a clean state.
                    maxScore = 0;
                    segPos = "";
                    naviveBayesSeg(strIn.Substring(runStart, i - runStart), 0, "", runStart);
                    boundaries.Append(segPos);
                }

                // Skip punctuation and any other non-Chinese characters.
                while (i < len && !isChineseWord(strIn[i]))
                {
                    i++;
                }
            }

            segSentence = boundaries.ToString();
            return getSegedString(strIn);
        }

// Segmentation core: bestSegSolution = max(solutions(segSolution[i])).
// Segmenting a run can be viewed as seg(StringIn) = firPart + seg(StringIn - firPart);
// the score measures how good a candidate segmentation is.
        /// <summary>
        /// Finds the segmentation of <paramref name="strIn"/> whose words have the
        /// largest summed wordLib count. If <paramref name="score"/> plus that sum
        /// beats maxScore, stores the new maxScore and writes the boundary list
        /// (<paramref name="seg"/> followed by "&lt;start&gt;s&lt;end&gt;s" per word) to segPos.
        /// </summary>
        /// <param name="strIn">Run of characters still to be segmented.</param>
        /// <param name="score">Score already accumulated before this run.</param>
        /// <param name="seg">Boundary list already accumulated before this run.</param>
        /// <param name="tPos">Absolute index of strIn[0] in the full input text.</param>
        public void naviveBayesSeg(string strIn, long score, string seg, int tPos)
        {
            if (strIn == null)
            {
                strIn = "";
            }

            int n = strIn.Length;

            // best[k]: highest score reachable for the suffix starting at k.
            // pick[k]: length of the word taken at k, or 0 to skip one character.
            // Filling these back to front visits each suffix once, where plain
            // recursion re-explores the same suffix along every path leading to it.
            long[] best = new long[n + 1];
            int[] pick = new int[n + 1];

            for (int start = n - 1; start >= 0; start--)
            {
                bool found = false;
                long bestHere = 0;
                int bestLen = 0;

                // Shorter words first and strict '>' below: on a tie the shorter
                // word wins, and any word wins over skipping. This assumes the
                // counts are positive, as produced by training.
                for (int wordLen = 1; start + wordLen <= n; wordLen++)
                {
                    long count;
                    if (!wordLib.TryGetValue(strIn.Substring(start, wordLen), out count))
                    {
                        continue;
                    }

                    long candidate = count + best[start + wordLen];
                    if (!found || candidate > bestHere)
                    {
                        found = true;
                        bestHere = candidate;
                        bestLen = wordLen;
                    }
                }

                // Leaving this character outside any word is always possible.
                long skipScore = best[start + 1];
                if (!found || skipScore > bestHere)
                {
                    bestHere = skipScore;
                    bestLen = 0;
                }

                best[start] = bestHere;
                pick[start] = bestLen;
            }

            long total = score + best[0];
            if (total <= maxScore)
            {
                return; // nothing better than what is already recorded
            }

            // Walk the chosen path and emit a "<start>s<end>s" pair for each word.
            StringBuilder path = new StringBuilder(seg);
            int pos = 0;
            while (pos < n)
            {
                int wordLen = pick[pos];
                if (wordLen == 0)
                {
                    pos++;
                    continue;
                }

                path.Append(tPos + pos).Append('s').Append(tPos + pos + wordLen).Append('s');
                pos += wordLen;
            }

            maxScore = total;
            segPos = path.ToString();
        }
    }
}

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.