馬爾可夫鏈演算法（markov演算法）的awk、C++、C語言實現代碼

馬爾可夫鏈演算法（markov演算法）的awk、C++、C語言實現代碼_C 語言

最後更新：2017-01-19 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

1. 問題描述

馬爾可夫鏈演算法用於產生一段隨機的英文，其思想非常簡單。首先讀入資料，然後將讀入的資料分成首碼和尾碼兩部分，通過首碼來隨機擷取尾碼，籍此產生一段可讀的隨機英文。

為了說明方便，假設我們有如下一段話：

複製代碼代碼如下:

Show your flowcharts and conceal your tables and I will be mystified. Show your tables and your flowcharts will be obvious.

假設首碼的長度為2,則我們處理輸入以後得到如下資料，我們首先擷取一個首碼，然後在首碼的尾碼列表中隨機播放一個單詞，然後改變首碼，重複上述過程，這樣，我們產生的句子將是可讀的。

下面是處理過的資料：

複製代碼代碼如下:

首碼尾碼
show your flowcharts tables
your flowcharts and will
flowcharts and conceal
flowcharts will be
your tables and and
will be mystified. obvious.
be mystified. show
be obvious. (end)

處理這個文本的馬爾可夫鏈演算法將首先帶引show your,然後隨機取出flowcharts 或者table 兩個單詞，假設選擇的是flowcharts, 則新的首碼就是your flowcharts,同理，選擇table 時，新的首碼就是your table,有了新的首碼your flowcharts 以後，再次隨即選擇它的尾碼，這次是在and 和 will 中隨機播放，重複上述過程，就能夠產生一段可讀的文本。具體描述如下：

複製代碼代碼如下:

設定 w1 和 w2 為文本的前兩個詞
輸出 w1 和 w2

迴圈：
    隨機地選出 w3,它是文本中 w1 w2 的尾碼中的一個
    列印 w3
    把 w1 和 w2 分別換成 w2 和 w3
    重複迴圈

2.awk 程式

馬爾可夫鏈演算法並不難，我們會在後面看到，用c語言來解決這個問題會相當麻煩，而用awk則只需要5分鐘就能搞定。這簡直就是一個示範awk優點的問題。

awk 中有關聯陣列，正好可以用來表示首碼和尾碼的關係。程式如下：

# markov.awk: markov chain algorithm for 2-word prefixesBEGIN { MAXGEN = 10000; NONWORD = "\n"; w1 = w2 = NONWORD }{  for (i = 1; i <= NF; i++) {   # read all words    statetab[w1,w2,++nsuffix[w1,w2]] = $i    w1 = w2    w2 = $i  }}END {  statetab[w1,w2,++nsuffix[w1,w2]] = NONWORD # add tail  w1 = w2 = NONWORD  for (i = 0; i < MAXGEN; i++) { # generate    r = int(rand()*nsuffix[w1,w2]) + 1 # nsuffix >= 1    p = statetab[w1,w2,r]    if (p == NONWORD)      exit    print p    w1 = w2     # advance chain    w2 = p  }}

3. C++ 程式

該問題的主要痛點就在於通過首碼隨機的擷取尾碼，在C++ 中，我們可以藉助map 來實現首碼和尾碼的對應關係，以此得到較高的開發效率。

/* Copyright (C) 1999 Lucent Technologies *//* Excerpted from 'The Practice of Programming' *//* by Brian W. Kernighan and Rob Pike */#include <time.h>#include <iostream>#include <string>#include <deque>#include <map>#include <vector>using namespace std;const int NPREF = 2;const char NONWORD[] = "\n";  // cannot appear as real line: we remove newlinesconst int MAXGEN = 10000; // maximum words generatedtypedef deque<string> Prefix;map<Prefix, vector<string> > statetab; // prefix -> suffixesvoid    build(Prefix&, istream&);void    generate(int nwords);void    add(Prefix&, const string&);// markov main: markov-chain random text generationint main(void){  int nwords = MAXGEN;  Prefix prefix; // current input prefix  srand(time(NULL));  for (int i = 0; i < NPREF; i++)    add(prefix, NONWORD);  build(prefix, cin);  add(prefix, NONWORD);  generate(nwords);  return 0;}// build: read input words, build state tablevoid build(Prefix& prefix, istream& in){  string buf;  while (in >> buf)    add(prefix, buf);}// add: add word to suffix deque, update prefixvoid add(Prefix& prefix, const string& s){  if (prefix.size() == NPREF) {    statetab[prefix].push_back(s);    prefix.pop_front();  }  prefix.push_back(s);}// generate: produce output, one word per linevoid generate(int nwords){  Prefix prefix;  int i;  for (i = 0; i < NPREF; i++)    add(prefix, NONWORD);  for (i = 0; i < nwords; i++) {    vector<string>& suf = statetab[prefix];    const string& w = suf[rand() % suf.size()];    if (w == NONWORD)      break;    cout << w << "\n";    prefix.pop_front(); // advance    prefix.push_back(w);  }}

4. c 程式

如果需要程式運行得足夠快，那就只能用較低級的語言來實現了。當我們用c 語言來實現時，就不得不考慮各種各樣的問題了。首先，面臨的第一個問題就是，如何表示首碼和尾碼的關係?

這裡採用首碼的key,尾碼為value 的方式儲存首碼與尾碼的關係，我們知道，hash表的尋找速度最快，所以，這裡採用hash表也是情理之中的事，只是看你能不能想到，用首碼作key,基於上面的思路，再仔細一點，就沒有什麼大問題了。

/* Copyright (C) 1999 Lucent Technologies *//* Excerpted from 'The Practice of Programming' *//* by Brian W. Kernighan and Rob Pike *//* * Markov chain random text generator. */#include <string.h>#include <stdlib.h>#include <stdio.h>#include <string.h>#include <time.h>#include "eprintf.h"enum {  NPREF  = 2,  /* number of prefix words */  NHASH  = 4093, /* size of state hash table array */  MAXGEN = 10000 /* maximum words generated */};typedef struct State State;typedef struct Suffix Suffix;struct State { /* prefix + suffix list */  char  *pref[NPREF];  /* prefix words */  Suffix *suf;      /* list of suffixes */  State  *next;     /* next in hash table */};struct Suffix { /* list of suffixes */  char  *word;     /* suffix */  Suffix *next;     /* next in list of suffixes */};State  *lookup(char *prefix[], int create);void  build(char *prefix[], FILE*);void  generate(int nwords);void  add(char *prefix[], char *word);State  *statetab[NHASH];  /* hash table of states */char NONWORD[] = "\n"; /* cannot appear as real word *//* markov main: markov-chain random text generation */int main(void){  int i, nwords = MAXGEN;  char *prefix[NPREF];    /* current input prefix */  int c;  long seed;  setprogname("markov");  seed = time(NULL);  srand(seed);  for (i = 0; i < NPREF; i++) /* set up initial prefix */    prefix[i] = NONWORD;  build(prefix, stdin);  add(prefix, NONWORD);  generate(nwords);  return 0;}  const int MULTIPLIER = 31; /* for hash() *//* hash: compute hash value for array of NPREF strings */unsigned int hash(char *s[NPREF]){  unsigned int h;  unsigned char *p;  int i;  h = 0;  for (i = 0; i < NPREF; i++)    for (p = (unsigned char *) s[i]; *p != '\0'; p++)      h = MULTIPLIER * h + *p;  return h % NHASH;}/* lookup: search for prefix; create if requested. *//* returns pointer if present or created; NULL if not. *//* creation doesn't strdup so strings mustn't change later. */State* lookup(char *prefix[NPREF], int create){  int i, h;  State *sp;  h = hash(prefix);  for (sp = statetab[h]; sp != NULL; sp = sp->next) {    for (i = 0; i < NPREF; i++)      if (strcmp(prefix[i], sp->pref[i]) != 0)        break;    if (i == NPREF)   /* found it */      return sp;  }  if (create) {    sp = (State *) emalloc(sizeof(State));    for (i = 0; i < NPREF; i++)      sp->pref[i] = prefix[i];    sp->suf = NULL;    sp->next = statetab[h];    statetab[h] = sp;  }  return sp;}/* addsuffix: add to state. suffix must not change later */void addsuffix(State *sp, char *suffix){  Suffix *suf;  suf = (Suffix *) emalloc(sizeof(Suffix));  suf->word = suffix;  suf->next = sp->suf;  sp->suf = suf;}/* add: add word to suffix list, update prefix */void add(char *prefix[NPREF], char *suffix){  State *sp;  sp = lookup(prefix, 1); /* create if not found */  addsuffix(sp, suffix);  /* move the words down the prefix */  memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));  prefix[NPREF-1] = suffix;}/* build: read input, build prefix table */void build(char *prefix[NPREF], FILE *f){  char buf[100], fmt[10];  /* create a format string; %s could overflow buf */  sprintf(fmt, "%%%ds", sizeof(buf)-1);  while (fscanf(f, fmt, buf) != EOF)    add(prefix, estrdup(buf));}/* generate: produce output, one word per line */void generate(int nwords){  State *sp;  Suffix *suf;  char *prefix[NPREF], *w;  int i, nmatch;  for (i = 0; i < NPREF; i++) /* reset initial prefix */    prefix[i] = NONWORD;  for (i = 0; i < nwords; i++) {    sp = lookup(prefix, 0);    if (sp == NULL)      eprintf("internal error: lookup failed");    nmatch = 0;    for (suf = sp->suf; suf != NULL; suf = suf->next)      if (rand() % ++nmatch == 0) /* prob = 1/nmatch */        w = suf->word;    if (nmatch == 0)      eprintf("internal error: no suffix %d %s", i, prefix[0]);    if (strcmp(w, NONWORD) == 0)      break;    printf("%s\n", w);    memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));    prefix[NPREF-1] = w;  }}

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More