Below I paste a document that describes the basic idea behind this program. The actual program differs from it in some details, but it is broadly the same. Criticism is welcome.
/* Goal: import all the LRC lyrics files found in one folder, analyze them, and write the final result to a txt file. */
/* Analysis: once the inverted index has been built, it is easy to write it to an index file (il.txt). So there should be one function that builds the inverted index and another that writes it out as a txt file:
0.bool lyricsindex_out (lyric_index_list index_list[],int m) {}
Relevant are:
struct word_item{//Word Item header storage
String Word;
int freq=0;
Word_doc *head_docid;
}; Word_item Lyrics_head=new word_item[];
struct word_doc{//Word Item location Store
int text_number;
Word_doc *next;
}*head_docid;
Word_doc *temp;
FStream fout ("Index_lyrics.txt");
fout<< "Doc" <<SETW << "Freq" <<SETW (+) << "<<SETW" << "list" <<endl;
for (int i=0,int j=0;index_list[i]->next!=null;i++) {
FOUT<<INDEX_LIST[I]->WORD<<SETW (<<INDEX_LIST[I]->FREQ<<SETW) << "<<SETW (12);
temp=index_list[i]->head_docid;
for (j=0;temp!=null;j++) {
fout<<temp->text_number<< ",";
temp=temp->next;
}
}
Output TXT file wording:
1. There should be a class lyric_index_analysis
It processes each LRC lyrics file found in the folder and filters out the "[...]" tags so that only the body of the lyrics remains. The body is then split into words, using spaces, line breaks and punctuation marks as boundaries, and every parsed word is
stored in
lyrics_head[],
For example:
Lyrics_head[0].word=xiejiang;
Lyrics_head[0].freq=1;
Lyrics_head[0].head_docid=new Word_item;
lyrics_head[0].head_docid->text_number=0;
lyrics_head[0].head_docid->next=null;
Each time a document is processed, the upper bound of the word-header array built so far is remembered. This can be done by a function,
int get_lyrics_head (word_item lyrics_head[], string filename)
which fills the array and returns that upper bound.
2. After the class has finished processing all the documents, there is a
lyrics_head[] array of size n.
A function then sorts all the words in alphabetical order;
the sorted result is left in the same array:
void lyric_mergesort (word_item lyrics_head[], int left, int right)
3. The last step is index_list analy_setup_index (word_item lyrics_head[], int n),
which creates a linked list.
Input: lyrics_head[] and n.
The array is traversed; if the next entry holds the same word as the current one, the two entries are merged.
A pointer walks along lyrics_head[] until the run of identical words ends,
and the merged entry is then placed in the list.
Because the merge sort is stable, the document nodes of entries with the same lyrics_head[].word can be linked directly to each other. The method is as follows, for example:
struct index_list{//is used to generate the final index table
String Word;
int freq=0;
Word_doc *head_docid;
Index_list* Next;
};
word_doc* Lyrics_doc; =new word_doc[];
Index_list Lyric_index_list=new index_list[];
Lyric_index_list[0].word=lyrics_head[0].word;
Lyric_index_list[0].head_docid=lyrics_head[0].head_docid;
Lyric_index_list[0].freq=lyrics_head[0].freq;
lyric_index_list[0]->next=null;
Lyrics_doc=lyrics_head[0].head_docid;
for (int i=0,int j=0;i<n;i++) {
If (Lyric_index_list[j].word==lyrics_head[i].word) {
lyric_index_list[j].freq++;
lyrics_doc->next=lyrics_head[i].head_docid;
lyrics_doc=lyrics_doc->next;
}
else{
j + +;
LYRIC_INDEX_LIST[J-1].NEXT=LYRIC_INDEX_LIST[J];
lyric_index_list[j]->next=null;
Lyric_index_list[j].word=lyrics_head[i].word;
Lyric_index_list[j].head_docid=lyrics_head[i].head_docid;
Lyric_index_list[j].freq=lyrics_head[i].freq;
}
}
Return lyric_index_list;
*/
Okay, enough talk; on to the code.
First, the .cpp file where main is located:
#include <iostream> #include <fstream> #include <string> #include <iomanip> #include < stdio.h> #include <io.h> #include "analysis_lyrics.h" using namespace Std;int Main () {lyric_index_analysis lyric_a;//set up lyrics analysis of the class string str = "+"; int bound = 0, upper = 0, i = 1;//has n number, then there is superscript N, which is the upper and lower bounds of the term table word_item *temp = new Word_item [max_size-500];//used to merge the exchange needs of the struct _finddata_t fileinfo;//file processing, look for the default directory of lyrics file long pfile;//call successfully returned 0, otherwise return -1//cout < < "<< PFile <<" documents are: "<< fileinfo.name <<" numbered: 1 "<< endl;if ((pFile = _findfirst ("*.LRC", &fileinfo)) = =-1) {cout << "does not exist. lrc file" << Endl;return 0;} else {cout << "<< i++ <<" documents are: "<< fileinfo.name <<" numbered: 1 "<< Endl; Lyric_a.lyrics_input (Fileinfo.name, Upper, bound, 1);//input file call cout << "Nether" << bound << "Upper bound" "<< Upper << Endl;while (_findnext (pFile, &fileinfo) = = 0) {cout <<" first "<< I << "Documents are:" << fileinfo.name << "numbered:" << i << endl;bound = upper; Lyric_a.lyrics_input (Fileinfo.name, Upper, bound, I); cout << "Nether" << bound << "Upper bound" < < upper << endl;i++;}} _findclose (pFile); bound = 0;cout << "The lower bound of the total Word table" << bound << "Upper bound" << Upper << Endl ; Lyric_a.lyric_mergesort (Lyric_a.return_lyrics_head (), temp, bound, upper-1); Lyric_a.print (upper); Lyric_a.lyricsindex_out (upper);} The small code below is a program that iterates through a document in a folder/* #include <iostream> #include <io.h> using namespace Std;int main () {struct _ finddata_t Fileinfo;long hfile;if ((hfile = _findfirst ("*.LRC", &fileinfo)) = =-1) return-1;else {cout << Filein Fo.name << endl;while (_findnext (hfile, &fileinfo) = = 0) {cout << fileinfo.name << Endl;}} _findclose (hfile); return 0;} */
Next is the header (.h) of the class that handles the lyrics files:
// analysis_lyrics.h -- data structures and the analysis class used to build
// an inverted index from LRC lyrics files.
#ifndef ANALYSIS_LYRICS_H
#define ANALYSIS_LYRICS_H

#include <fstream>
#include <iostream>
#include <string>

// NOTE(review): a using-directive in a header leaks into every file that
// includes it. It is kept because the accompanying .cpp files use unqualified
// standard-library names and rely on it.
using namespace std;

// Number of entries in the term table allocated by lyric_index_analysis.
static const int max_size = 4000;
// The listing defined `max_size` a second time with the value 200, which is a
// redefinition error. The value is kept under a distinct name.
// NOTE(review): presumably this was meant as a per-line limit -- confirm.
static const int max_line_size = 200;

// Posting node: the number of one document in which a word occurs, plus a
// link to the next posting of the same word.
struct word_doc {
    int text_number = 0;
    word_doc *next = nullptr;
};

// Term-table entry produced by the first pass over the documents: a word and
// the posting node for the document it was found in.
struct word_item {
    string word;
    word_doc *head_docid = nullptr;
};

// Node of the final index list: a word, the number of documents that contain
// it, the head of its posting chain, and the next word in the index.
struct index_list {
    string word;
    int freq = 0;
    word_doc *head_docid = nullptr;
    index_list *next = nullptr;
};

// Main class for analysing LRC lyrics files: it parses the lyrics documents
// into a term table and builds and writes the index document.
class lyric_index_analysis {
private:
    word_item *lyrics_head;  // term table with max_size entries
    index_list *l_h_list;    // head node of the index linked list

public:
    // Lower and upper bound of the current term table; they change with the
    // number of documents processed.
    int bound = 0, upper = 0;

    // Allocates the term table and the head node of the index list.
    lyric_index_analysis()
        : lyrics_head(new word_item[max_size]), l_h_list(new index_list) {}

    // The class owns raw allocations, so copying would double-free them.
    lyric_index_analysis(const lyric_index_analysis &) = delete;
    lyric_index_analysis &operator=(const lyric_index_analysis &) = delete;

    // Releases the posting nodes, the term table and every index node.
    ~lyric_index_analysis() {
        // Each posting node is referenced by exactly one term-table slot.
        // Index nodes only alias those pointers, so the postings are freed
        // here and never through the index list.
        for (int i = 0; i < max_size; ++i) {
            delete lyrics_head[i].head_docid;
        }
        delete[] lyrics_head;
        while (l_h_list != nullptr) {
            index_list *following = l_h_list->next;  // read the link before deleting
            delete l_h_list;
            l_h_list = following;
        }
    }

    // Returns the term table (still owned by this object).
    word_item *return_lyrics_head() { return lyrics_head; }

    // Returns the head of the index list (still owned by this object).
    index_list *return_l_h_list() { return l_h_list; }

    // Opens a file and reads its lyrics. Parameters: file name, upper bound of
    // the term table (updated), lower bound of the term table, document
    // number. Calls insert_word_list for every parsed word.
    void lyrics_input(string fileName, int &upper, int bound, int number);

    // Inserts a word into the term table unless it is already present in
    // [bound, upper), i.e. in the current document. Parameters: term table,
    // upper bound (updated), lower bound, word, document number.
    bool insert_word_list(word_item lyrics_head[], int &upper, int bound, char *elem, int position);

    // Sorts the term-table entries in [left, right] by word.
    void lyric_mergesort(word_item lyrics_head[], word_item temp[], int left, int right);

    // Builds the final index list from the first n term-table entries.
    void analys_setup_index(word_item lyrics_head[], int n);

    // Builds the index from the first n term-table entries and writes it to
    // "Lyrics_index_list.txt", one line per word: word#freq@doc,doc,...
    // The same information is echoed to standard output.
    // Returns true when the file was written, false when it could not be
    // opened or a write failed.
    bool lyricsindex_out(int n) {
        ofstream fout("Lyrics_index_list.txt", ios::trunc);
        if (!fout.is_open()) {
            return false;
        }
        fout.setf(ios::left);
        // (A column-formatted header line using setw was disabled in the
        // original listing.)

        if (n <= 0) {
            return true;  // empty table: leave an empty index file
        }
        analys_setup_index(lyrics_head, n);

        // Walk with a local cursor so the member keeps pointing at the head;
        // the destructor needs it to free the list.
        for (const index_list *entry = l_h_list; entry != nullptr; entry = entry->next) {
            fout << entry->word << "#" << entry->freq << "@";
            cout << entry->word << "appearing in";
            for (const word_doc *doc = entry->head_docid; doc != nullptr; doc = doc->next) {
                fout << doc->text_number;
                cout << doc->text_number;
                if (doc->next != nullptr) {
                    fout << ",";
                    cout << ",";
                }
            }
            fout << endl;
            cout << "document, Frequency is" << entry->freq << endl;
        }
        fout.close();
        return !fout.fail();
    }

    // Prints the first n term-table entries together with the number of the
    // document each one was found in.
    void print(int n) {
        for (int i = 0; i < n && i < max_size; i++) {
            if (lyrics_head[i].head_docid == nullptr) {
                continue;  // slot was never filled
            }
            cout << lyrics_head[i].word << "appears in" << lyrics_head[i].head_docid->text_number << "Document" << endl;
        }
    }
};

#endif  // ANALYSIS_LYRICS_H
Then comes the .cpp file that implements the class:
#include <fstream> #include <string> #include <iomanip> #include "analysis_lyrics.h"//Open file to enter lyrics, The parameters are: file name, upper bound of the term table, lower bound of the term table, and number of the document. Call Insert_word_list (Word_item lyrics_head[], int &upper,int bound, char* elem,int position), and eventually return to the term table void Lyric_index _analysis::lyrics_input (string filename, int& upper, int bound, int number) {//lyrics_head = new word_item[max_size]; Ifstream fin (filename), if (!fin.is_open ()) {cout << "file read failed!\n"; exit (0);} String Str;getline (Fin, str);//Iterate through the entire document, read one line at a time, and then parse do {cout << str << Endl;char c[max_size] = {'} '};int i = 0, IC = 0;for (i = 0; Str[i]! = ') '; i++); for (int j = i + 1; str[j]! = ' \ r ' &&str[j]! = ' \ n ' &&str[j]! = ' + '; j + +) {//Remove the character after the quotation mark, but if it is t then do not go if ( (int) Str[j] = () {while (str[j]! = "' &&str[j]! = ' \ r ' &&str[j]! = ' \ n ' &&str[j]! = ' \") {J++;if (Str[j] = = ' t ') {j--;break;}} if (str[j] = = ' \ r ' | | str[j] = = ' \ n ' | | str[j] = = ' + ') break; Remove Case if (((int) str[j] >=) && ((int)STR[J] <=)) c[ic++] = (int) str[j] + 32;elsec[ic++] = str[j];//cout << Daxiao---------------> << (int ) str[j] << Endl;} cout << "split string:" << c << endl;const char *d = "[]-;,:/?!. 
() ";//with these characters as the delimiter char *p = Null;char *next_p = Null;p = strtok_s (c, D, &next_p); while (p) {insert_word_list (Lyrics_head, Upper, bound, p, number)//cout << "superscript is" <<upper<< "divided:" << lyrics_head[upper-1].word << Endl;p = strtok_s (NULL, D, &next_p);} Getline (FIN, str);} while (!fin.eof ()); Fin.close ();};/ /Insert the word into the Word table, if it is not inserted, then exit directly (only the current document), the parameters are: Word Item table, upper bound, lower bound, Word, document number bool Lyric_index_analysis::insert_word_list (Word_item Lyrics_head[], int& Upper, int bound, char* elem, int position) {for (int i = bound; i<upper; i++) {if (lyrics_head [I].word = = Elem) return false;} Lyrics_head[upper].head_docid = new Word_doc;lyrics_head[upper].head_docid->text_number = position;lyrics_head[ Upper].head_docid->next = Nullptr;lyrics_head[upper].word= Elem;//cout << "split Out (back):" << lyrics_head[upper].word << Endl;upper++;return true;};/ /The table of the word items is sorted by the sort void Lyric_index_analysis::lyric_mergesort (Word_item lyrics_head[], Word_item temp[], int left, int right) {int I, j, K, Mid = (left + right)/2;if (left = right) return; Lyric_mergesort (Lyrics_head, temp, left, mid); Lyric_mergesort (Lyrics_head, temp, mid + 1, right); for (i = mid; I >= left; i--) temp[i] = lyrics_head[i];for (j = 1; j <= Right-mid; J + +) Temp[right-j + 1] = lyrics_head[j + mid];for (i = left, j = right, K = left; k <= right; k++) if (Temp[i].word < = Temp[j].word) Lyrics_head[k] = temp[i++];elselyrics_head[k] = temp[j--];};/ /Word entry establishes the final index table void Lyric_index_analysis::analys_setup_index (Word_item lyrics_head[], int n) {Word_doc *temp;// Index for the position of each word item that appears in the document index_list* t_l_h_list = L_h_list;t_l_h_list->word = Lyrics_head[0].word;t_l_h_list->freq = 1 ; t_l_h_list->head_docid = 
Lyrics_head[0].head_docid;t_l_h_list->next = Nullptr;temp = T_L_H_LisT->head_docid;cout << "Word is" << t_l_h_list->word << "appearing in" << Temp->text_number << "Document, at this time the frequency is" << t_l_h_list->freq << endl;for (int i = 1; i < n; i++) {while (Lyrics_head[i-1].word = = Lyrics_head[i].word) {Temp->next = Lyrics_head[i].head_docid;temp = Temp->next;cout << "Word Item and previous equality, appears in" << temp->text_number << "document, at this time the frequency is" << T_l_h_list->freq + 1 << endl;t_l_h_list->freq++ ; ++i;} if (i = = N) break;index_list* temp_l_h_list = new Index_list;temp_l_h_list->word = lyrics_head[i].word;temp_l_h_list- >freq = 1;temp_l_h_list->head_docid = Lyrics_head[i].head_docid;temp = Temp_l_h_list->head_docid;temp_l_h_ List->next = Nullptr;t_l_h_list->next = Temp_l_h_list;t_l_h_list = T_l_h_list->next;cout << "Word is" < < T_l_h_list->word << "appears in << temp->text_number <<" document, at which time the frequency is "<< t_l_h_list->freq << Endl;}};
LRC lyrics file retrieval in C++ (a self-written lyrics file search, recorded here for reference)