《編程珠璣》一書中有這樣一個問題:統計《聖經》中出現頻率最高的單詞。這是典型的 Top K 問題,可採用經典的 Hash + 堆方法解決。下面的示例以隨機生成的整數資料代替《聖經》文本,代碼如下:
/*
 * Top-K most frequent tokens using a chained hash table plus a K-element
 * min-heap.
 *
 * The program writes MAX random integers to DATA_FILE, reads them back as
 * whitespace-separated tokens, counts every distinct token in the hash
 * table, and keeps the K most frequent ones in a min-heap whose root is the
 * weakest candidate seen so far.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h> /* malloc/free/rand; the non-standard <malloc.h> is not needed */
#include <string.h>
#include <time.h>   /* C header; <ctime> is its C++ counterpart */

#define HASHLEN 101               /* number of hash buckets */
#define WORD_CHARS 29             /* longest token accepted by one fscanf call */
#define WORDLEN (WORD_CHARS + 1)  /* token buffer size, including the NUL */
#define MAX 100000                /* number of integers written by gen_data() */
#define DOMAIN 300                /* generated values lie in [0, DOMAIN) */
#define K 10                      /* how many top entries are reported */
#define DATA_FILE "c://data1.txt" /* scratch file shared by gen_data() and main() */

/* Two-step stringification so WORD_CHARS can be pasted into a scanf format. */
#define STRINGIFY_(x) #x
#define STRINGIFY(x) STRINGIFY_(x)

/* One entry of a hash bucket chain: a token, its occurrence count, next link. */
typedef struct node {
    char *word;
    int count;
    struct node *next;
} node, *node_ptr;

/* Bucket heads; static storage starts out as all-NULL. */
static node_ptr head[HASHLEN];
/* Min-heap holding (shallow copies of) the current top-K entries. */
static node array[K];

/* Allocate `size` bytes or terminate the program with a diagnostic. */
static void *xmalloc(size_t size)
{
    void *mem = malloc(size);

    if (mem == NULL) {
        fprintf(stderr, "out of memory\n");
        exit(EXIT_FAILURE);
    }
    return mem;
}

/*
 * Hash a NUL-terminated string to a bucket index in [0, HASHLEN).
 *
 * The value is reduced modulo HASHLEN on every step. Reducing only when
 * `value > HASHLEN` lets the function return exactly HASHLEN (e.g. for the
 * one-letter word "e", ASCII 101), which indexes one past the end of head[].
 */
int hash_function(const char *p)
{
    unsigned int value = 0;

    while (*p != '\0') {
        /* unsigned char keeps bytes >= 0x80 from being sign-extended */
        value = (value * 31u + (unsigned char)*p++) % HASHLEN;
    }
    return (int)value;
}

/*
 * Record one occurrence of `str` in the hash table.
 *
 * If the word is already present its count is incremented; otherwise a new
 * node holding a private copy of the string is pushed on the front of the
 * bucket's chain. Terminates the program if memory cannot be allocated.
 */
void append_word(const char *str)
{
    int index = hash_function(str);

    for (node_ptr p = head[index]; p != NULL; p = p->next) {
        if (strcmp(str, p->word) == 0) {
            p->count++;
            return;
        }
    }

    /* strlen, not sizeof: sizeof(str) is only the size of the pointer. */
    size_t bytes = strlen(str) + 1;
    node_ptr q = xmalloc(sizeof *q);

    q->word = xmalloc(bytes);
    memcpy(q->word, str, bytes);
    q->count = 1;
    q->next = head[index];
    head[index] = q;
}

/*
 * Write MAX space-separated random integers in [0, DOMAIN) to DATA_FILE.
 * Terminates the program if the file cannot be created or fully written.
 */
void gen_data(void)
{
    FILE *fp = fopen(DATA_FILE, "w");

    if (fp == NULL) {
        perror(DATA_FILE);
        exit(EXIT_FAILURE);
    }

    srand((unsigned int)time(NULL));
    for (int i = 0; i < MAX; i++) {
        fprintf(fp, "%d ", rand() % DOMAIN);
    }

    /* fclose flushes buffered output, so a write error can surface here. */
    if (fclose(fp) != 0) {
        perror(DATA_FILE);
        exit(EXIT_FAILURE);
    }
}

/*
 * Sift array[index] down until the min-heap property (by count) holds for
 * the subtree rooted at `index`.
 *
 * The heap is addressed from array[0] and has endIndex - beginIndex + 1
 * elements; every caller in this file passes beginIndex == 0.
 */
void heapAdjust(node array[], int beginIndex, int endIndex, int index)
{
    int length = endIndex - beginIndex + 1;

    for (;;) {
        int smallest = index;
        int left = 2 * index + 1; /* children of i are 2i+1 and 2i+2 (0-based) */
        int right = 2 * index + 2;

        if (left < length && array[left].count < array[smallest].count) {
            smallest = left;
        }
        if (right < length && array[right].count < array[smallest].count) {
            smallest = right;
        }
        if (smallest == index) {
            return;
        }

        node tmp = array[smallest];
        array[smallest] = array[index];
        array[index] = tmp;
        index = smallest;
    }
}

/* Turn array[0..len-1] into a min-heap ordered by count. */
void heapBuild(node array[], int len)
{
    /* Start at the last internal node and sift each one down. */
    for (int i = len / 2 - 1; i >= 0; i--) {
        heapAdjust(array, 0, len - 1, i);
    }
}

/*
 * Fill heap[0..k-1] with the k most frequent entries of the hash table.
 *
 * Every node of every bucket chain is visited, not just the chain heads.
 * Returns the number of entries stored, which is less than k when the table
 * holds fewer than k distinct words (in that case the entries are left in
 * insertion order, not heap order).
 */
static int select_top_k(node heap[], int k)
{
    int filled = 0;

    assert(heap != NULL);
    if (k <= 0) {
        return 0;
    }

    for (int i = 0; i < HASHLEN; i++) {
        for (node_ptr p = head[i]; p != NULL; p = p->next) {
            if (filled < k) {
                heap[filled++] = *p;
                if (filled == k) {
                    heapBuild(heap, k);
                }
            } else if (heap[0].count < p->count) {
                /* Beats the weakest of the current top k: replace the root. */
                heap[0] = *p;
                heapAdjust(heap, 0, k - 1, 0);
            }
        }
    }
    return filled;
}

/* Release every node and word owned by the hash table. */
static void free_table(void)
{
    for (int i = 0; i < HASHLEN; i++) {
        node_ptr p = head[i];

        while (p != NULL) {
            node_ptr next = p->next;

            free(p->word);
            free(p);
            p = next;
        }
        head[i] = NULL;
    }
}

int main(void)
{
    gen_data();

    FILE *fp_passage = fopen(DATA_FILE, "r");

    if (fp_passage == NULL) {
        perror(DATA_FILE);
        return EXIT_FAILURE;
    }

    char str[WORDLEN];
    int cnt1 = 0;

    /* The field width keeps an over-long token from overflowing str; such a
     * token is read as several consecutive pieces instead. */
    while (fscanf(fp_passage, "%" STRINGIFY(WORD_CHARS) "s", str) == 1) {
        cnt1++;
        append_word(str);
    }
    printf("the cnt1 is %d\n", cnt1);
    fclose(fp_passage);

    /* Find the top K; entries are printed in heap order, not sorted. */
    int found = select_top_k(array, K);

    printf("the top %d is as follows\n", K);
    for (int i = 0; i < found; i++) {
        printf("%s , and its count is %d\n", array[i].word, array[i].count);
    }

    /* array[] only borrows the word pointers, so free after printing. */
    free_table();
    return 0;
}
代碼中的核心部分有:
1. 雜湊(Hash)函數
2. 堆調整函數
關鍵資料結構:
Hash 鏈表的節點結構體,包含單詞字串、出現次數(統計資訊),以及指向下一個節點的指標