Source analysis of the distance.c file in Word2vec

Source: Internet
Author: User
Tags: strcmp

#include <stdio.h>
#include <string.h>
#include <math.h>
// #include <malloc.h>  -- left disabled as in the original; malloc/calloc/free come from <stdlib.h>
#include <stdlib.h>

// NOTE(review): the numeric values of max_size and max_w were lost when this
// page was extracted; 2000 and 50 are believed to be the upstream word2vec
// defaults -- confirm against the original distance.c.
const long long max_size = 2000;  // max length of strings
const long long N = 5;            // number of closest words that will be shown
const long long max_w = 50;       // max length of vocabulary entries

/*
 * Interactive nearest-neighbour lookup over a binary word-vector file.
 *
 * Usage: ./distance <FILE>
 *
 * File layout as read by this program: a text header "<words> <size>"
 * followed by `words` entries, each being the word text terminated by a
 * single space and then `size` raw floats. Every vector is normalised to
 * unit length while loading.
 *
 * For each line typed on stdin, the words of the line are looked up, their
 * vectors are summed and normalised, and the N vocabulary entries with the
 * largest inner product (= cosine similarity, since all vectors are unit
 * length) are printed. Typing EXIT or reaching end-of-input terminates.
 *
 * Returns 0 on normal termination (or when only the usage text is printed),
 * -1 if the file cannot be opened, has an invalid header or is truncated,
 * or if memory cannot be allocated.
 */
int main(int argc, char **argv) {
  // NOTE(review): the capacity of st/bi was also lost in extraction; 100 is
  // believed to be the upstream value -- confirm.
  enum { max_query_words = 100 };
  FILE *f = NULL;
  char st1[max_size];
  char *bestw[N];  // array of N pointers, each to a heap buffer of max_size chars
  char file_name[max_size], st[max_query_words][max_size];
  float dist, len, bestd[N], vec[max_size];
  long long words, size, a, b, c, d, cn, bi[max_query_words];
  float *M = NULL;
  char *vocab = NULL;
  int input_done = 0;
  int rc = -1;

  // Variable-length arrays cannot take an initializer, so clear the pointers
  // by hand; this lets the cleanup path call free() unconditionally.
  for (a = 0; a < N; a++) bestw[a] = NULL;

  if (argc < 2) {
    printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY format\n");
    return 0;
  }
  // Bounded copy: argv[1] may be longer than file_name.
  snprintf(file_name, (size_t)max_size, "%s", argv[1]);
  f = fopen(file_name, "rb");
  if (f == NULL) {
    printf("Input file not found\n");
    return -1;
  }
  // Header: vocabulary size, then number of dimensions. size must fit in
  // vec[], which holds max_size floats.
  if (fscanf(f, "%lld", &words) != 1 || fscanf(f, "%lld", &size) != 1 ||
      words <= 0 || size <= 0 || size > max_size) {
    printf("Invalid file header\n");
    goto cleanup;
  }

  // calloc checks the element-count multiplication for overflow.
  vocab = calloc((size_t)words, (size_t)max_w);
  M = calloc((size_t)words, (size_t)size * sizeof(float));
  for (a = 0; a < N; a++) bestw[a] = malloc((size_t)max_size);
  for (a = 0; a < N; a++) {
    if (bestw[a] == NULL) break;
  }
  if (vocab == NULL || M == NULL || a < N) {
    printf("Cannot allocate memory: %lld MB    %lld  %lld\n",
           (long long)((double)words * (double)size * sizeof(float) / 1048576.0),
           words, size);
    goto cleanup;
  }

  for (b = 0; b < words; b++) {
    // Read the word text up to the separating space. Newlines are dropped,
    // and characters beyond max_w - 1 are discarded so that the terminator
    // always stays inside this entry's max_w-byte slot.
    a = 0;
    while (1) {
      vocab[b * max_w + a] = fgetc(f);
      if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
      if ((a < max_w - 1) && (vocab[b * max_w + a] != '\n')) a++;
    }
    vocab[b * max_w + a] = 0;
    if (fread(&M[b * size], sizeof(float), (size_t)size, f) != (size_t)size) {
      printf("Unexpected end of file while reading vectors\n");
      goto cleanup;
    }
    len = 0;
    for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
    len = sqrt(len);
    // Normalisation of coordinates; an all-zero vector is left unchanged.
    if (len > 0) {
      for (a = 0; a < size; a++) M[a + b * size] /= len;
    }
  }
  fclose(f);
  f = NULL;

  while (!input_done) {
    printf("Enter word or sentence (EXIT to break): ");
    // Read one line into st1, truncating at max_size - 1 characters.
    a = 0;
    while (1) {
      int chr = fgetc(stdin);
      if (chr == EOF) {
        // End of input: process whatever was typed so far, then stop.
        st1[a] = 0;
        input_done = 1;
        break;
      }
      st1[a] = (char)chr;
      if ((st1[a] == '\n') || (a >= max_size - 1)) {
        st1[a] = 0;
        break;
      }
      a++;
    }
    if (input_done && a == 0) break;
    printf("st1:%s words:%lld \n", st1, words);
    if (!strcmp(st1, "EXIT")) break;

    // Split st1 on spaces into the rows of st; cn counts the words found.
    // Empty tokens (leading, trailing or repeated spaces) are skipped, and
    // words beyond the capacity of st/bi are ignored.
    cn = 0;
    c = 0;
    while (cn < max_query_words) {
      while (st1[c] == ' ') c++;
      if (st1[c] == 0) break;
      b = 0;
      while ((st1[c] != 0) && (st1[c] != ' ')) st[cn][b++] = st1[c++];
      st[cn][b] = 0;
      cn++;
    }
    if (cn == 0) continue;

    for (a = 0; a < cn; a++) {
      for (b = 0; b < words; b++) {
        if (!strcmp(&vocab[b * max_w], st[a])) break;
      }
      if (b == words) b = -1;
      bi[a] = b;
      printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
      if (b == -1) {
        printf("Out of dictionary word!\n");
        break;  // a single unknown word aborts the whole query
      }
    }
    if (b == -1) continue;

    printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
    // When several words are entered, vec is the sum of their vectors.
    for (a = 0; a < size; a++) vec[a] = 0;
    for (b = 0; b < cn; b++) {
      if (bi[b] == -1) continue;
      for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
    }
    len = 0;
    for (a = 0; a < size; a++) len += vec[a] * vec[a];
    len = sqrt(len);
    // Normalise vec. This changes nothing when a single word was entered,
    // because its stored vector is already unit length.
    if (len > 0) {
      for (a = 0; a < size; a++) vec[a] /= len;
    }
    for (a = 0; a < N; a++) bestd[a] = -1;
    for (a = 0; a < N; a++) bestw[a][0] = 0;

    // Query and vocabulary vectors are both unit length, so the inner
    // product equals the cosine similarity: larger means more similar.
    for (c = 0; c < words; c++) {
      // Skip vocabulary entries that are themselves part of the query.
      a = 0;
      for (b = 0; b < cn; b++) {
        if (bi[b] == c) a = 1;
      }
      if (a == 1) continue;
      dist = 0;
      for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
      // Insert dist into the descending top-N list, shifting lower entries down.
      for (a = 0; a < N; a++) {
        if (dist > bestd[a]) {
          for (d = N - 1; d > a; d--) {
            bestd[d] = bestd[d - 1];
            strcpy(bestw[d], bestw[d - 1]);
          }
          bestd[a] = dist;
          strcpy(bestw[a], &vocab[c * max_w]);
          break;
        }
      }
    }
    for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
  }
  rc = 0;

cleanup:
  if (f != NULL) fclose(f);
  for (a = 0; a < N; a++) free(bestw[a]);
  free(M);
  free(vocab);
  return rc;
}

Source analysis of the distance.c file in Word2vec

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.