/*
 * distance: interactive nearest-neighbour lookup over word vectors.
 *
 * Loads a binary word-vector file (header "<words> <size>", then for each
 * word its text, a space and `size` raw floats), normalizes every vector to
 * unit length, and then repeatedly reads a word or sentence from stdin and
 * prints the N vocabulary entries with the highest cosine similarity to the
 * (summed) query vector.
 */
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>

/*
 * NOTE(review): the numeric values of max_size, max_w and MAX_QUERY_WORDS
 * were lost in the copy of this file under review; 2000, 50 and 100 are the
 * values used by the reference word2vec distance tool -- confirm.
 */
const long long max_size = 2000;  // max length of strings
const long long N = 5;            // number of closest words that will be shown
const long long max_w = 50;       // max length of vocabulary entries

enum { MAX_QUERY_WORDS = 100 };   // max number of words accepted in one query

/*
 * Scales the `size` floats at `v` to unit Euclidean length.
 * An all-zero vector is left untouched instead of being divided by zero.
 */
static void normalize(float *v, long long size) {
  float len = 0;
  for (long long a = 0; a < size; a++) {
    len += v[a] * v[a];
  }
  len = sqrt(len);
  if (len == 0) {
    return;
  }
  for (long long a = 0; a < size; a++) {
    v[a] /= len;
  }
}

/*
 * Reads `words` entries from `f` into `vocab` (fixed slots of max_w bytes)
 * and `M` (rows of `size` floats), normalizing each row.
 *
 * Entries longer than max_w - 1 characters are truncated so that the
 * terminating NUL always stays inside the entry's own slot.
 *
 * Returns 0 on success, -1 if the file ends early or a read fails.
 */
static int read_model(FILE *f, long long words, long long size, char *vocab, float *M) {
  for (long long b = 0; b < words; b++) {
    char *entry = &vocab[b * max_w];
    float *row = &M[b * size];
    long long a = 0;
    int ch;

    // The word runs up to the next space; the newline that terminates the
    // previous record (or the header) is skipped rather than stored.
    while ((ch = fgetc(f)) != EOF && ch != ' ') {
      if (a < max_w - 1 && ch != '\n') {
        entry[a++] = (char)ch;
      }
    }
    entry[a] = 0;
    if (ch == EOF) {
      return -1;
    }
    if (fread(row, sizeof(float), (size_t)size, f) != (size_t)size) {
      return -1;
    }
    normalize(row, size);  // normalization of coordinates
  }
  return 0;
}

/*
 * Returns the index of the first vocabulary entry equal to `word`,
 * or -1 when the word is not in the vocabulary.
 */
static long long find_word(const char *vocab, long long words, const char *word) {
  for (long long b = 0; b < words; b++) {
    if (strcmp(&vocab[b * max_w], word) == 0) {
      return b;
    }
  }
  return -1;
}

/*
 * Usage: ./distance <FILE>
 *
 * Returns 0 on normal exit (including when called without arguments) and
 * -1 when the model file cannot be opened, parsed or held in memory.
 */
int main(int argc, char **argv) {
  FILE *f;
  char st1[max_size];            // raw query line; split into words in place
  char *st[MAX_QUERY_WORDS];     // start of each query word inside st1
  long long bi[MAX_QUERY_WORDS]; // vocabulary index of each query word
  const char *bestw[N];          // N best words; each points into vocab (or at "")
  float bestd[N];                // similarity of each entry of bestw
  float vec[max_size];           // normalized sum of the query word vectors
  float dist;
  long long words, size, a, b, c, d, cn;
  float *M = NULL;
  char *vocab = NULL;
  int rc = -1;

  if (argc < 2) {
    printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY format\n");
    return 0;
  }
  // Open argv[1] directly: copying it into a fixed buffer could overflow.
  f = fopen(argv[1], "rb");
  if (f == NULL) {
    printf("Input file not found\n");
    return -1;
  }

  // Header: vocabulary size, then number of dimensions. `size` must fit in
  // vec[], and both allocation sizes must be representable as size_t.
  if (fscanf(f, "%lld", &words) != 1 || fscanf(f, "%lld", &size) != 1 ||
      words <= 0 || size <= 0 || size > max_size ||
      (unsigned long long)words > SIZE_MAX / (size_t)max_w ||
      (unsigned long long)words > SIZE_MAX / ((size_t)size * sizeof(float))) {
    printf("Invalid or unsupported file header\n");
    goto done;
  }

  vocab = malloc((size_t)words * (size_t)max_w);
  M = malloc((size_t)words * (size_t)size * sizeof *M);
  if (vocab == NULL || M == NULL) {
    printf("Cannot allocate memory: %lld MB    %lld  %lld\n",
           (long long)((size_t)words * (size_t)size * sizeof(float) / 1048576), words, size);
    goto done;
  }

  if (read_model(f, words, size, vocab, M) != 0) {
    printf("Input file is truncated or corrupt\n");
    goto done;
  }
  fclose(f);
  f = NULL;

  while (1) {
    printf("Enter word or sentence (EXIT to break): ");
    if (fgets(st1, (int)max_size, stdin) == NULL) {
      break;  // end of input: leave instead of spinning on EOF
    }
    st1[strcspn(st1, "\n")] = 0;
    printf("st1:%s words:%lld\n", st1, words);
    if (!strcmp(st1, "EXIT")) {
      break;
    }

    // Split st1 into space-separated words; st[0..cn-1] point at them.
    // Runs of spaces are skipped, so empty words are never produced.
    cn = 0;
    char *p = st1;
    while (1) {
      while (*p == ' ') {
        p++;
      }
      if (*p == 0) {
        break;
      }
      if (cn == MAX_QUERY_WORDS) {
        printf("Only the first %d words of the query are used\n", MAX_QUERY_WORDS);
        break;
      }
      st[cn++] = p;
      while (*p != 0 && *p != ' ') {
        p++;
      }
      if (*p == 0) {
        break;
      }
      *p++ = 0;
    }
    if (cn == 0) {
      continue;
    }

    for (a = 0; a < cn; a++) {
      bi[a] = find_word(vocab, words, st[a]);
      printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
      if (bi[a] == -1) {
        // A single unknown word aborts the whole query.
        printf("Out of dictionary word!\n");
        break;
      }
    }
    if (a < cn) {
      continue;
    }

    printf("\n%50s%22s\n", "Word", "Cosine distance");
    printf("------------------------------------------------------------------------\n");

    // The query vector is the sum of the vectors of all query words,
    // normalized afterwards.
    for (a = 0; a < size; a++) {
      vec[a] = 0;
    }
    for (b = 0; b < cn; b++) {
      for (a = 0; a < size; a++) {
        vec[a] += M[a + bi[b] * size];
      }
    }
    normalize(vec, size);

    for (a = 0; a < N; a++) {
      bestd[a] = -1;
      bestw[a] = "";
    }

    // Both the query and the vocabulary vectors are unit length, so the
    // cosine similarity is just the inner product; larger means closer.
    for (c = 0; c < words; c++) {
      int is_query_word = 0;
      for (b = 0; b < cn; b++) {
        if (bi[b] == c) {
          is_query_word = 1;
        }
      }
      if (is_query_word) {
        continue;  // never report the query words themselves
      }

      dist = 0;
      for (a = 0; a < size; a++) {
        dist += vec[a] * M[a + c * size];
      }

      // Insert into the descending top-N list, shifting lower entries down.
      for (a = 0; a < N; a++) {
        if (dist > bestd[a]) {
          for (d = N - 1; d > a; d--) {
            bestd[d] = bestd[d - 1];
            bestw[d] = bestw[d - 1];
          }
          bestd[a] = dist;
          bestw[a] = &vocab[c * max_w];
          break;
        }
      }
    }
    for (a = 0; a < N; a++) {
      printf("%50s\t\t%f\n", bestw[a], bestd[a]);
    }
  }
  rc = 0;

done:
  if (f != NULL) {
    fclose(f);
  }
  free(M);
  free(vocab);
  return rc;
}
// Source analysis of the distance.c file in word2vec