轉載請標明出處:http://blog.csdn.net/u012027907
有時需要統計一篇文章中有多少個單詞,這時候就需要詞頻分析器來解決了。
基本思想:
將檔案中或使用者輸入的字串先存起來,然後從第一個字元開始依次向後掃描:遇到字母,就先將其添加到一個暫存字串中,然後再向後掃描;若下一個字元還是字母,則將此字元接到剛才的暫存字串後面;若不是字母,則剛才的暫存字串就是一個單詞。這樣依次掃描完所有字元。當然,在掃描時,還要將新單詞與已經儲存的單詞依次比較,若相同則不再添加,只需將該單詞的詞數加一即可。
先截個圖SeeSee:
我還將分析的結果儲存至Access資料庫中,這裡就要用ADO來訪問資料庫。上次我已經將ADO連線類(ADOConn)封裝好了,這次只要拿來用就行了。
代碼:
void ADOConn::OnInitADOConn(){ // CString sysPath = "C:\\fg\\MuscicPlayer\\";HRESULT hr; ::CoInitialize(NULL); //初始化OLE/COM環境hr=m_pConnection.CreateInstance("ADODB.Connection"); //建立connection對象// 在ADO操作中建議語句中要常用try...catch()來捕獲錯誤資訊, // 因為它有時會經常出現一些意想不到的錯誤。try { if(SUCCEEDED(hr)){char *dbPath="Provider=Microsoft.Jet.OLEDB.4.0;Data Source=WordData.mdb"; hr=m_pConnection->Open(_bstr_t(dbPath),"","",adModeUnknown);}//_bstr_t strConnect="Provider=SQLOLEDB;Server=LENOVO-PC;DataBase=Study;uid=sa;pwd=zyc123";//m_pConnection->Open(strConnect,"","",adModeUnknown);} //捕獲異常catch(_com_error e){ e.Description();}}void ADOConn::ExitConnect(){//關閉記錄集和串連 if(m_pRecordset!=NULL)m_pRecordset->Close();m_pConnection->Close();//釋放環境::CoUninitialize();}_RecordsetPtr& ADOConn::GetRecordSet(_bstr_t bstrSQL){ try { //串連資料庫,如果conection對象為空白,則重新串連資料庫 if(m_pConnection==NULL) OnInitADOConn(); //建立記錄集對象 m_pRecordset.CreateInstance(__uuidof(Recordset)); //取得表中的記錄 m_pRecordset->Open(bstrSQL,m_pConnection.GetInterfacePtr(),adOpenDynamic,adLockOptimistic,adCmdText); } catch(_com_error e) { e.Description(); } //返回記錄集 return m_pRecordset;}BOOL ADOConn::ExecuteSQL(_bstr_t _bstrSQL){_variant_t RecordsAffected;try{//是否已串連資料庫if(m_pConnection==NULL)OnInitADOConn();m_pConnection->Execute(_bstrSQL,NULL,adCmdText);return true;}catch(_com_error e){e.Description();return false;}}
當然用資料結構來存最好了。
/////////////// One word split out of the input text ///////////////
// The declarations are placed on their own lines; when they share a line
// with the banner comment the whole typedef is treated as a comment.
typedef struct WordStore
{
    char word[30];      // NUL-terminated word, at most 29 characters
} wordstore;

/////////////// One distinct word together with its statistics ///////////////
typedef struct WordStoreAll
{
    float frequency;    // share of this word in the text (fraction, not percent)
    int   number;       // how many times the word occurs
    char  word[30];     // NUL-terminated word, at most 29 characters
} wordstoreall;
主類:
//////////////////////////////////////////class ReadWord{int AllWord;int Count;bool AddTrue;wordstore WordNode[MAX];wordstoreall Word[MAX];/////資料庫操作使用ADOConn m_AdoConn;_RecordsetPtr m_pRecordset;_ConnectionPtr m_pConnection;char *tablename;public:int Choice;public:ReadWord();int GetCount(){return Count;}bool GetAddTrue(){return AddTrue;}char *ReadFromText();char *ReadFromScreen();void strcopy(char *dest,const char *sour);void Transform(char *str);void GetEveryWord();void Countword();void Frequency();void Order();void Print(int X);void color(int a); ///-------資料庫操作----------------void IsAddToAccess(); //判斷使用者是否將分析結果添加到資料庫void AddToAccess(); //將分析結果添加到資料庫void Select(); //查詢資料庫中的資料void CreateTable();void GetTableName(int n);void ReadTableName();void WriteTNameToFile();};
///////////////////////////////#include "Readword.h"//////////////////////////////////////建構函式////////////// ReadWord::ReadWord(){AddTrue = false;}//////////////////加點顏色SeeSee/////////////////////////////void ReadWord::color(int a) //顏色函數{SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE),a);}/////////////從螢幕上讀取資訊到字串中/////////////char *ReadWord::ReadFromScreen(){char *string;string = (char*)malloc(1000);printf("請輸入:");//fflush(stdout);gets(string);return string;}//////////////////////////////////////////////////////從檔案中讀取資訊到字串中/////////////char *ReadWord::ReadFromText(){FILE *fp;char *string;char FileName[30];string = (char*)malloc(5000);if(NULL == string){printf("記憶體配置失敗!\n");return NULL;}printf("請輸入檔案名稱(不用輸尾碼名):");scanf("%s",FileName);//gets(FileName);strcat(FileName,".txt");fp = fopen(FileName,"rb");if( NULL == fp){printf("檔案開啟錯誤\n");return NULL;}fgets(string,5000,fp);fclose(fp); return string;}////////////////將非單詞的字元轉化為空白格////////////void ReadWord::Transform(char *str){while(*str != '\0'){if(*str<'a'||*str>'z')*str=' ';str++;}}//////////////拷貝函數/////////////////void ReadWord::strcopy(char *dest,const char *sour){while(*sour != '\0'){*dest++=*sour++;}*dest='\0';}//////////////將每個單詞分離出來//////////////////void ReadWord::GetEveryWord(){char tempw[30];char tempc[2]; char *String;bool m_flag=false;int i=0,n=0;if(1 == Choice) String = ReadFromText();elseString = ReadFromScreen();strlwr(String); ///將大寫字母轉化為小寫字母 Transform(String);while(1){if(*String != '\0'){tempc[0]=*String;if(' ' != *String){tempw[i++]=tempc[0];m_flag = true;}else{if(m_flag){m_flag = false;tempw[i]='\0';strcopy(WordNode[n].word,tempw);n++;for(;i>=0;i--)tempw[i]=0;i=0;}}}else if(*(--String) != ' ') //這時為了將檔案最後一個單詞因為沒加標點,而未能統計的最後一個單詞統計進去{tempw[i]='\0';strcopy(WordNode[n].word,tempw);n++;for(;i>=0;i--)tempw[i]=0;i=0;break;}elsebreak;String++;}AllWord=n; //得到分離出的總詞數}//////////////////////對提取出的所有單詞計數////////void ReadWord::Countword(){int i,j;int count=0,m_flag=0; 
for(i=0;i<AllWord;i++)Word[i].number=1;for(i=0;i<AllWord;i++){m_flag=0;for(j=0;j<AllWord;j++){if(!strcmp(Word[j].word,WordNode[i].word)) //若有相同的單詞,其相應的number加一{Word[j].number++;m_flag=1;}}if(0 == m_flag) //若原有的單詞中沒有相同的,則添加此單詞到Word中{strcopy(Word[count].word,WordNode[i].word);count++;}}Count=count;}////////////////計算每個單詞的頻率//////////////void ReadWord::Frequency(){int i;for(i=0;i<Count;i++){Word[i].frequency=(float)Word[i].number/Count;}}////////////////冒泡排序/////////////////////////void ReadWord::Order(){char tempstr[30];int tempnum;float tempfre;int m,n;for(m=1;m<Count;m++)for(n=0;n<Count-m;n++){if(strcmp(Word[n].word,Word[n+1].word)>0){//////交換單詞strcopy(tempstr,Word[n].word);strcopy(Word[n].word,Word[n+1].word);strcopy(Word[n+1].word,tempstr);////////交換詞數 tempnum = Word[n].number;Word[n].number = Word[n+1].number;Word[n+1].number = tempnum;/////////交換頻率tempfre = Word[n].frequency;Word[n].frequency = Word[n+1].frequency;Word[n+1].frequency = tempfre;}}}///////////////////輸出結果//////////////////////void ReadWord::Print(int X){int i;printf("對該文本的詞頻分析如下:\n");printf("序號\t單詞\t 個數\t頻率\n"); for(i=0;i<X;i++){printf("%-3d\t%-10s\t%d\t%.2f%%\n",i+1,Word[i].word,Word[i].number,Word[i].frequency*100);}}///////////////從資料庫中選取查詢結果//////////////void ReadWord::Select(){m_AdoConn.OnInitADOConn(); //初始化連結庫類//char *sql="Select * From Wordtable";char *q=" ";char Sql[50];char *Sql1 ="SELECT * FROM ";strcopy(Sql,Sql1);strcat(Sql,tablename);strcat(Sql,q);m_pRecordset=m_AdoConn.GetRecordSet((_bstr_t)Sql); //開啟並獲得記錄集m_pConnection.CreateInstance(__uuidof(Connection)); //建立connection對象_variant_t var; char strword[30];int number;double frequen;int i=0;try{ if(!m_pRecordset->adoBOF) //表中資料不為空白,將記錄及指標移到第一條m_pRecordset->MoveFirst();else{printf("\n資料表為空白!\n");m_AdoConn.ExitConnect();return ; } printf("對該文本的詞頻分析如下:\n"); printf("序號\t單詞\t 個數\t頻率\n");// 讀入庫中各欄位並輸出 while(!m_pRecordset->adoEOF) { var = m_pRecordset->GetCollect("Word"); //獲得一條記錄的Word欄位資訊if(var.vt != VT_NULL) 
strcopy(strword,(LPCSTR)_bstr_t(var)); var = m_pRecordset->GetCollect("Num"); //獲得一條記錄的Num欄位資訊if(var.vt != VT_NULL) number = var.intVal; var = m_pRecordset->GetCollect("Frequency"); if(var.vt != VT_NULL) frequen = var.dblVal; printf("%-3d\t%-10s\t%d\t%.2lf%%\n",i+1,strword,number,frequen*100); i+=1;m_pRecordset->MoveNext(); } } catch(_com_error *e) { //AfxMessageBox(e->ErrorMessage()); e->Description();}m_AdoConn.ExitConnect();}//////////////////使用者是否同意將結果添加到資料庫/////////void ReadWord::IsAddToAccess(){char button;printf("\n是否將結果添加到資料庫?是按[Y],否按其他鍵。\n");fflush(stdin);scanf("%c",&button);if(button == 'Y' || button == 'y'){AddTrue = !AddTrue;GetTableName(1);CreateTable();AddToAccess();}}////////////////將結果添加到資料庫中/////////////////////////void ReadWord::AddToAccess(){m_AdoConn.OnInitADOConn();char *q=" ";char Sql[50];char *Sql1 ="SELECT * FROM ";strcopy(Sql,Sql1);strcat(Sql,tablename);strcat(Sql,q);m_pRecordset=m_AdoConn.GetRecordSet((_bstr_t)Sql); //開啟並獲得記錄集m_pConnection.CreateInstance(__uuidof(Connection)); //建立connection對象_variant_t var; int i; try{if(AddTrue){for(i=0;i<Count;i++){m_pRecordset->AddNew();m_pRecordset->PutCollect("Word",_variant_t(Word[i].word));m_pRecordset->PutCollect("Num",_variant_t((long)Word[i].number));m_pRecordset->PutCollect("Frequency",_variant_t((float)Word[i].frequency));m_pRecordset->Update();}}printf("\n添加成功!\n\n");m_AdoConn.ExitConnect();}catch(...){ printf("操作失敗\n");}}//////////////將使用者輸入的表名儲存到檔案中////////////////void ReadWord::WriteTNameToFile(){char *Table,*str=" ";Table = (char*)malloc(20);strcopy(Table,str);strcat(Table,tablename);strcat(Table,str);FILE *fp;if((fp = fopen("TableNameList.txt","ab+")) == NULL){printf("不能開啟檔案\n");return;}if(fwrite(Table,strlen(Table)*sizeof(char),1,fp) != 1)printf("檔案寫出錯!\n");fclose(fp);}////////////////////存檔案中讀取表名////////////////////////void ReadWord::ReadTableName(){FILE *fp;char *string;char *FileName="TableNameList.txt";string = (char*)malloc(300);if(NULL == string){printf("記憶體配置失敗!\n");return;}fp = 
fopen(FileName,"rb");if( NULL == fp){printf("檔案開啟錯誤\n");return;}fgets(string,300,fp);fclose(fp);printf("可供查詢的表名如下:\n"); printf("%s\n",string);}//////////////獲得要建立表的名稱或獲得要查詢表的名稱////////////void ReadWord::GetTableName(int n){tablename = (char*)malloc(20);fflush(stdin);if(1 == n){printf("請輸入要建立的表名:");gets(tablename);WriteTNameToFile();}else{ReadTableName();printf("請輸入您要查詢的表名:");gets(tablename); /////////判斷 }}//////////////////////建立表//////////////////void ReadWord::CreateTable(){m_AdoConn.OnInitADOConn();//_bstr_t sql;char Sql[100];char *Sql1 = "CREATE TABLE ";char *Sql2 = " (Word varchar(30),Num int,Frequency float)";strcopy(Sql,Sql1);strcat(Sql,tablename);strcat(Sql,Sql2);if(!m_AdoConn.ExecuteSQL((_bstr_t)Sql)){printf("\n建立表失敗!\n");return;}m_AdoConn.ExitConnect();}
轉載請標明出處:http://blog.csdn.net/u012027907