1. Description of the problem
The Markov chain algorithm is used to generate random English text, and its idea is very simple. First read the input, then split it into prefixes and the suffixes that follow them; picking a suffix at random for the current prefix, over and over, produces random English that is still readable.
For the sake of illustration, suppose we have the following passage:
Copy Code code as follows:
Show your flowcharts and conceal your tables and I will be mystified. Show your tables and your flowcharts will be obvious.
Assuming a prefix length of 2, processing the input yields the data below. To generate text, we start with a prefix, randomly select one word from that prefix's suffix list, then shift the prefix to include the chosen word and repeat the process. Because every word is chosen from those that actually followed the prefix in the input, the sentences we produce are readable.
Here is the processed data:
Copy Code code as follows:
Prefix              Suffixes
Show your           flowcharts tables
your flowcharts     and will
flowcharts and      conceal
flowcharts will     be
your tables         and and
will be             mystified. obvious.
be mystified.       Show
be obvious.         (end)
When the Markov chain algorithm processes this text, it first prints "Show your" and then randomly picks one of the two words "flowcharts" or "tables". If it picks "flowcharts", the new prefix becomes "your flowcharts"; likewise, if it picks "tables", the new prefix becomes "your tables". With the new prefix "your flowcharts", it again selects a suffix at random, this time from "and" and "will". Repeating this process produces readable text. The detailed description is as follows:
Copy Code code as follows:
Set w1 and w2 to the first two words of the text
Print w1 and w2
Loop:
    Randomly choose w3, one of the suffixes of the prefix w1 w2 in the text
    Print w3
    Replace w1 and w2 with w2 and w3 respectively
    Repeat the loop
2. Awk program
The Markov chain algorithm is not difficult. As we will see later, solving this problem in C is quite troublesome, whereas in awk it takes only about 5 minutes. The problem is practically a showcase for the virtues of awk.
Associative arrays in awk can be used to represent the relationship between prefixes and suffixes. The program is as follows:
# markov.awk: markov chain algorithm for 2-word prefixes
#
# Usage: awk -f markov.awk input.txt
#
# statetab[w1,w2,k] holds the k-th word seen after the prefix (w1, w2);
# nsuffix[w1,w2] counts how many suffixes that prefix has.
# NOTE: rand() is never seeded here, so in most awk implementations every
# run produces the same output; call srand() in BEGIN for varied output.
BEGIN { MAXGEN = 10000; NONWORD = "\n"; w1 = w2 = NONWORD }

{
    for (i = 1; i <= NF; i++) {                 # read all words
        statetab[w1,w2,++nsuffix[w1,w2]] = $i
        w1 = w2
        w2 = $i
    }
}

END {
    statetab[w1,w2,++nsuffix[w1,w2]] = NONWORD  # add tail
    w1 = w2 = NONWORD
    for (i = 0; i < MAXGEN; i++) {              # generate
        r = int(rand() * nsuffix[w1,w2]) + 1    # nsuffix >= 1
        p = statetab[w1,w2,r]
        if (p == NONWORD)
            exit
        print p
        w1 = w2                                 # advance chain
        w2 = p
    }
}
3. C++ program
The main difficulty of this problem is randomly obtaining a suffix for a given prefix. In C++, we can use a map to associate each prefix with its suffixes, which makes development much more efficient.
/* Copyright (C) 1999 Lucent Technologies */
/* Excerpted from 'The Practice of Programming' */
/* by Brian W. Kernighan and Rob Pike */

#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <string>
#include <deque>
#include <map>
#include <vector>

using namespace std;

const int  NPREF = 2;           // number of words in a prefix
const char NONWORD[] = "\n";    // cannot appear as real line: we remove newlines
const int  MAXGEN = 10000;      // maximum words generated

typedef deque<string> Prefix;

map<Prefix, vector<string> > statetab;  // prefix -> suffixes

void build(Prefix&, istream&);
void generate(int nwords);
void add(Prefix&, const string&);

// markov main: markov-chain random text generation
// Reads words from standard input, builds the state table, then prints up
// to MAXGEN randomly chosen words to standard output.
int main(void)
{
    int nwords = MAXGEN;
    Prefix prefix;              // current input prefix

    srand(static_cast<unsigned>(time(NULL)));
    for (int i = 0; i < NPREF; i++)     // set up initial prefix
        add(prefix, NONWORD);
    build(prefix, cin);
    add(prefix, NONWORD);               // mark the end of the input
    generate(nwords);
    return 0;
}

// build: read input words, build state table
// prefix is updated in place and holds the last NPREF words on return.
void build(Prefix& prefix, istream& in)
{
    string buf;

    while (in >> buf)
        add(prefix, buf);
}

// add: add word to suffix deque, update prefix
// While the prefix is still shorter than NPREF words, s is only appended
// to it and nothing is recorded in statetab.
void add(Prefix& prefix, const string& s)
{
    if (prefix.size() == static_cast<Prefix::size_type>(NPREF)) {
        statetab[prefix].push_back(s);
        prefix.pop_front();
    }
    prefix.push_back(s);
}

// generate: produce output, one word per line
// Stops after nwords words, or earlier when the NONWORD end marker is drawn.
void generate(int nwords)
{
    Prefix prefix;
    int i;

    for (i = 0; i < NPREF; i++)         // reset initial prefix
        add(prefix, NONWORD);
    for (i = 0; i < nwords; i++) {
        // find() rather than operator[]: an unknown prefix must not insert
        // an empty entry, and rand() % 0 would be undefined behaviour.
        map<Prefix, vector<string> >::const_iterator it = statetab.find(prefix);
        if (it == statetab.end() || it->second.empty())
            break;
        const vector<string>& suf = it->second;
        // w refers to storage owned by statetab, so it stays valid while
        // prefix is modified below.
        const string& w = suf[rand() % suf.size()];
        if (w == NONWORD)
            break;
        cout << w << "\n";
        prefix.pop_front();             // advance
        prefix.push_back(w);
    }
}
4. C program
If you want the program to run fast enough, it has to be written in a lower-level language. When implementing it in C, we have to consider a variety of issues. The first question to face is: how do we represent the relationship between prefixes and suffixes?
Here the prefix is used as the key and its suffixes as the value. Since a hash table offers the fastest lookups, using one here is a reasonable choice; the key insight is to hash on the prefix. With that idea in place and a little care, there are no big problems.
/* Copyright (C) 1999 Lucent Technologies * * * excerpted from ' Practice of programming '//* by Brian W. Kernighan
and Rob Pike * * * Markov chain random text generator. * #include <string.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <t ime.h> #include "eprintf.h" enum {npref = 2,/* Number of prefix words/Nhash = 4093,/* Size of State hash
Table array */Maxgen = 10000/* Maximum words generated */};
typedef struct State State;
typedef struct SUFFIX Suffix; struct State {/* prefix + suffix list */char *pref[npref]; /* Prefix words * * Suffix *suf; /* List of suffixes * * State *next;
/* Next in hash table/}; struct Suffix {/* List of suffixes/char *word; /* suffix * * suffix *next;
/* Next in List of suffixes */};
State *lookup (char *prefix[], int create);
void build (char *prefix[], file*);
void Generate (int nwords);
void Add (char *prefix[], char *word);State *statetab[nhash]; /* Hash Table of States */char nonword[] = "\ n"; /* cannot appear as real word/* Markov main:markov-chain random text generation/int main (void) {int I, nwords
= Maxgen; Char *prefix[npref];
/* Current INPUT prefix */int c;
Long seed;
Setprogname ("Markov");
Seed = time (NULL);
Srand (seed);
for (i = 0; i < npref i++)/* Set up initial prefix/prefix[i] = Nonword;
Build (prefix, stdin);
Add (prefix, nonword);
Generate (Nwords);
return 0; const int multiplier = 31; /* for hash ()///* Hash:compute hash value for array of NPREF strings/unsigned int hash (char *s[npref]) {unsigned
int h;
unsigned char *p;
int i;
h = 0;
for (i = 0; i < Npref. i++) for (P = (unsigned char *) s[i]; *p!= ' "; p++) H = multiplier * H + *p;
return h% Nhash; }/* Lookup:search for prefix; Create if requested. */* Returns pointer if present or created; NULL if not. * * * creation doesn ' t strdup so StRings Mustn ' t change later.
* * state* lookup (char *prefix[npref], int create) {int I, H;
State *sp;
H = hash (prefix); for (sp = statetab[h]; SP!= NULL; sp = Sp->next) {for (i = 0; i < npref; i++) if (strcmp (prefix[i), sp-&
Gt;pref[i])!= 0) break;
if (i = = npref)/* found it/return SP;
} if (create) {SP = (state *) Emalloc (sizeof);
for (i = 0; i < npref i++) sp->pref[i] = Prefix[i];
Sp->suf = NULL;
Sp->next = Statetab[h];
STATETAB[H] = SP;
} return SP; }/* Addsuffix:add to state.
Suffix must not change later/void Addsuffix (state *sp, char *suffix) {suffix *suf;
Suf = (Suffix *) emalloc (sizeof (Suffix));
Suf->word = suffix;
Suf->next = sp->suf;
Sp->suf = Suf;
}/* Add:add Word to suffix list, update prefix/void Add (char *prefix[npref), char *suffix) {State *sp; SP = lookup (prefix, 1);
/* Create if not found * * addsuffix (sp, suffix); /* Move The words down the prefix * * * memmove (prefix, prefix+1, (NPREF-1) *sizeof (prefix[0));
Prefix[npref-1] = suffix;
}/* Build:read input, build prefix table */void Build (char *prefix[npref], FILE *f) {char buf[100], fmt[10]; /* Create a format string;
%s could overflow buf * * sprintf (FMT, "%%%ds", sizeof (BUF)-1);
while (FSCANF (f, FMT, buf)!= EOF) Add (prefix, estrdup (BUF));
}/* Generate:produce output, one word per line */void Generate (int nwords) {State *sp;
Suffix *suf;
Char *prefix[npref], *w;
int I, Nmatch;
for (i = 0; i < npref i++) * Reset initial prefix/prefix[i] = Nonword;
for (i = 0; i < nwords i++) {sp = lookup (prefix, 0);
if (sp = = NULL) eprintf ("Internal error:lookup failed");
Nmatch = 0; for (Suf = sp->suf; suf!= NULL; suf = Suf->next) if (rand ()% ++nmatch = 0)/* prob = 1/nmatch/w
= suf->word; if (nmatch = 0) eprintf ("Internal error:no suffix%d%s", I, PREfix[0]);
if (strcmp (w, nonword) = = 0) break;
printf ("%s\n", W);
Memmove (prefix, prefix+1, (NPREF-1) *sizeof (prefix[0));
Prefix[npref-1] = W; }
}