Source analysis of the distance.c file in Word2vec

Last Update:2015-04-09 Source: Internet

Author: User

Tags: strcmp

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more >

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>  // malloc/free (used instead of the non-standard <malloc.h>)

/*
 * distance: interactive nearest-neighbour lookup over word vectors.
 *
 * Loads a binary word-projection file (header "<words> <size>", then for each
 * word its text followed by <size> raw floats), normalizes every vector to
 * unit length, and then repeatedly reads a word or sentence from stdin and
 * prints the N vocabulary entries whose vectors have the largest inner
 * product with the (normalized) sum of the query word vectors.
 *
 * NOTE(review): the numeric values of max_size, max_w and the query-word
 * limit were lost when this listing was extracted (they appeared as "-").
 * 2000 / 50 / 100 are the values used by the upstream word2vec distance.c;
 * confirm against the original article before relying on them.
 */
enum {
  max_size = 2000,        // max length of strings
  N = 5,                  // number of closest words that will be shown
  max_w = 50,             // max length of vocabulary entries
  max_query_words = 100   // max number of words accepted in one query
};

/*
 * Usage: ./distance <FILE>
 *
 * Returns 0 on normal exit (including the usage message), -1 when the input
 * file cannot be opened, is malformed or truncated, or memory cannot be
 * allocated.
 */
int main(int argc, char **argv) {
  FILE *f = NULL;
  char st1[max_size];                    // raw input line
  char *bestw[N] = {0};                  // N pointers, each to a buffer holding one result word
  char st[max_query_words][max_size];    // query split into words, one per row
  float dist, len, bestd[N], vec[max_size];
  long long words, size, a, b, c, d, cn, bi[max_query_words];
  float *M = NULL;                       // words x size matrix of word vectors
  char *vocab = NULL;                    // words slots of max_w chars each
  int alloc_ok = 1;
  int at_eof = 0;
  int rc = -1;

  if (argc < 2) {
    printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
    return 0;
  }
  f = fopen(argv[1], "rb");
  if (f == NULL) {
    printf("Input file not found\n");
    return -1;
  }
  // Header: vocabulary size, then number of dimensions.
  // size is bounded by max_size because the query vector vec[] has max_size slots.
  if (fscanf(f, "%lld", &words) != 1 || fscanf(f, "%lld", &size) != 1 ||
      words <= 0 || size <= 0 || size > max_size) {
    printf("Invalid file header\n");
    goto cleanup;
  }

  vocab = malloc((size_t)words * max_w);
  M = malloc((size_t)words * (size_t)size * sizeof *M);
  if (vocab == NULL || M == NULL) alloc_ok = 0;
  for (a = 0; a < N; a++) {
    bestw[a] = malloc(max_size);
    if (bestw[a] == NULL) alloc_ok = 0;
  }
  if (!alloc_ok) {
    printf("Cannot allocate memory: %lld MB    %lld  %lld\n",
           words * size * (long long)sizeof(float) / 1048576, words, size);
    goto cleanup;
  }

  for (b = 0; b < words; b++) {
    // Read the word text up to the separating space. Newlines (left over
    // after the previous vector) are skipped; over-long words are truncated
    // so the terminator always stays inside this word's max_w-byte slot.
    a = 0;
    while (1) {
      int ch = fgetc(f);
      if (ch == EOF || ch == ' ') break;
      if (ch == '\n') continue;
      if (a < max_w - 1) vocab[b * max_w + a++] = (char)ch;
    }
    vocab[b * max_w + a] = 0;
    if (fread(&M[b * size], sizeof(float), (size_t)size, f) != (size_t)size) {
      printf("Input file is truncated\n");
      goto cleanup;
    }
    len = 0;
    for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
    len = sqrt(len);
    for (a = 0; a < size; a++) M[a + b * size] /= len;  // normalization of coordinates
  }
  fclose(f);
  f = NULL;

  while (!at_eof) {
    printf("Enter word or sentence (EXIT to break): ");
    // Read one line; input beyond max_size - 1 characters ends the line early.
    a = 0;
    while (1) {
      int ch = fgetc(stdin);
      if (ch == EOF) at_eof = 1;
      if (ch == EOF || ch == '\n' || a >= max_size - 1) {
        st1[a] = 0;
        break;
      }
      st1[a++] = (char)ch;
    }
    if (at_eof && a == 0) break;  // stdin closed: stop instead of prompting forever
    printf("st1:%s words:%lld \n", st1, words);
    if (!strcmp(st1, "EXIT")) break;

    // Each space-separated word of st1 is stored as one row of the
    // two-dimensional array st; cn is the total number of words.
    // Empty tokens (repeated or trailing spaces) are dropped, and words
    // beyond max_query_words are ignored.
    cn = 0;
    b = 0;
    for (c = 0;; c++) {
      if (st1[c] == ' ' || st1[c] == 0) {
        if (b > 0) {
          st[cn][b] = 0;
          cn++;
          b = 0;
        }
        if (st1[c] == 0 || cn >= max_query_words) break;
      } else {
        st[cn][b++] = st1[c];
      }
    }
    if (cn == 0) continue;  // blank line: nothing to look up

    for (a = 0; a < cn; a++) {
      for (b = 0; b < words; b++) {
        if (!strcmp(&vocab[b * max_w], st[a])) break;
      }
      if (b == words) b = -1;
      bi[a] = b;
      printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
      if (b == -1) {
        printf("Out of dictionary word!\n");
        break;  // a single word missing from the vocabulary aborts the whole query
      }
    }
    if (b == -1) continue;

    printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
    for (a = 0; a < size; a++) vec[a] = 0;
    for (b = 0; b < cn; b++) {
      // Traverse each query word; with several words, vec is the sum of their vectors.
      if (bi[b] == -1) continue;
      for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
    }
    len = 0;
    for (a = 0; a < size; a++) len += vec[a] * vec[a];
    len = sqrt(len);
    // Normalize vec; this has no effect when only one word was entered,
    // because the stored vectors are already unit length.
    for (a = 0; a < size; a++) vec[a] /= len;

    for (a = 0; a < N; a++) bestd[a] = -1;
    for (a = 0; a < N; a++) bestw[a][0] = 0;
    // Query and vocabulary vectors are both normalized, so cosine similarity
    // equals the inner product: the larger the product, the more similar.
    for (c = 0; c < words; c++) {  // traverse the vocabulary
      a = 0;
      for (b = 0; b < cn; b++) {
        if (bi[b] == c) a = 1;  // skip vocabulary words that are part of the query
      }
      if (a == 1) continue;
      dist = 0;
      for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];  // inner product
      for (a = 0; a < N; a++) {  // find the insertion position for dist
        if (dist > bestd[a]) {
          for (d = N - 1; d > a; d--) {
            bestd[d] = bestd[d - 1];
            strcpy(bestw[d], bestw[d - 1]);
          }
          bestd[a] = dist;
          strcpy(bestw[a], &vocab[c * max_w]);
          break;
        }
      }
    }
    for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
  }
  rc = 0;

cleanup:
  if (f != NULL) fclose(f);
  for (a = 0; a < N; a++) free(bestw[a]);
  free(M);
  free(vocab);
  return rc;
}

Source analysis of the distance.c file in Word2vec

This article is an English version of an article that was originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More