/* Item lookup borrows the idea used in word2vec: a hash index is built over the item array so an item id can be found without scanning. */
/*
 * Item-based KNN, multithreaded.
 *
 * Pipeline (see main): Init() -> ReadItemInfo() -> CreatMulThread() -> FileJoin().
 *   1. Every line of the input file is one user record: a user id followed by
 *      the ids of the items that user bought.
 *   2. For every item the list of users who bought it is collected; items are
 *      found through an open-addressing hash table with linear probing.
 *   3. Worker threads each take a contiguous slice of the items and, for each
 *      item, write its most similar items (cosine similarity over the user
 *      lists) to a per-thread file.
 *   4. The per-thread files are concatenated into Output.txt and removed.
 */
#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * NOTE(review): the numeric values of the constants below were lost in the
 * copy of the source this file was restored from; the values here are
 * reconstructions and should be confirmed against the intended configuration.
 */
#define MaxString 50                   /* max item-id length, including the NUL */
#define MaxUserSize (1024 * 1024 * 10) /* size of the buffer holding one input line */
#define Similarity_item 20             /* neighbours kept per item */
#define MaxLen 1024                    /* line buffer used when merging outputs */

enum {
    MaxTokens = 10000,     /* max fields taken from one input line */
    NumOutputFiles = 20,   /* number of entries in filename[] */
    ItemGrowth = 1000,     /* items added per growth step of item[] */
    UserGrowth = 50,       /* user slots added per growth step of a user list */
    MinCommonUsers = 5     /* pairs need MORE than this many shared users */
};

/* Per-thread output files; thread t writes filename[t]. */
const char filename[NumOutputFiles][20] = {
    "Output1.txt",  "Output2.txt",  "Output3.txt",  "Output4.txt",
    "Output5.txt",  "Output6.txt",  "Output7.txt",  "Output8.txt",
    "Output9.txt",  "Output10.txt", "Output11.txt", "Output12.txt",
    "Output13.txt", "Output14.txt", "Output15.txt", "Output16.txt",
    "Output17.txt", "Output18.txt", "Output19.txt", "Output20.txt"
};

const int item_hash_size = 30000000;

struct ItemInfo {
    char itemid[MaxString];
    int totaluser;                 /* number of entries used in userlist */
    unsigned long long *userlist;  /* ids of the users who bought this item */
    int max_user;                  /* allocated capacity of userlist */
};

unsigned long long max_item = ItemGrowth, item_size = 0;
struct ItemInfo *item;
int *item_hash;                    /* slot -> index into item[], or -1 if empty */
char str1[MaxTokens][MaxString];   /* fields of the line currently being parsed */
int num_threads = 20;              /* must not exceed NumOutputFiles */
char str2[MaxLen];

/* Print msg to stderr and terminate the program with exit status 1. */
static void Die(const char *msg)
{
    fprintf(stderr, "%s\n", msg);
    exit(1);
}

/*
 * Resize ptr to hold count elements of elem_size bytes each.
 * Terminates the program on overflow or allocation failure, so the returned
 * pointer is always valid and the old pointer is never lost on failure.
 */
static void *GrowArray(void *ptr, size_t count, size_t elem_size)
{
    if (elem_size != 0 && count > SIZE_MAX / elem_size)
        Die("Allocation size overflow.");

    void *grown = realloc(ptr, count * elem_size);
    if (grown == NULL)
        Die("Out of memory.");
    return grown;
}

/*
 * Hash an item id into [0, item_hash_size).
 * Bytes are read as unsigned char so the result does not depend on whether
 * plain char is signed on the target platform.
 */
int GetWordHash(const char *itemid)
{
    unsigned long long hash = 0;

    for (const unsigned char *p = (const unsigned char *)itemid; *p != '\0'; p++)
        hash = hash * 257 + *p;

    return (int)(hash % (unsigned long long)item_hash_size);
}

/*
 * Look an item id up in the hash table.
 * Returns its index in item[], or -1 when the id has not been added.
 */
int SearchItem(const char *itemid)
{
    unsigned int hash = (unsigned int)GetWordHash(itemid);

    for (;;) {
        int index = item_hash[hash];

        if (index == -1)
            return -1;
        if (strcmp(itemid, item[index].itemid) == 0)
            return index;
        hash = (hash + 1) % (unsigned int)item_hash_size;
    }
}

/*
 * Append a new item whose first buyer is userid and register it in the hash
 * table. The caller is expected to have checked with SearchItem() that the
 * id is not present yet. Ids longer than MaxString - 1 bytes are truncated.
 * Returns the index of the new entry in item[].
 */
int AddItemidToItem(const char *itemid, unsigned long long userid)
{
    /* Keep at least one empty slot so that probing always terminates. */
    if (item_size + 1 >= (unsigned long long)item_hash_size)
        Die("Item hash table is full.");

    /* Reallocate memory if needed. */
    if (item_size + 2 >= max_item) {
        max_item += ItemGrowth;
        item = GrowArray(item, (size_t)max_item, sizeof *item);
    }

    struct ItemInfo *entry = &item[item_size];

    entry->max_user = UserGrowth;
    entry->userlist = calloc((size_t)entry->max_user, sizeof *entry->userlist);
    if (entry->userlist == NULL)
        Die("Out of memory.");
    snprintf(entry->itemid, sizeof entry->itemid, "%s", itemid);
    entry->userlist[0] = userid;
    entry->totaluser = 1;

    /* Hash the stored (possibly truncated) id so later lookups agree. */
    unsigned int hash = (unsigned int)GetWordHash(entry->itemid);
    while (item_hash[hash] != -1)
        hash = (hash + 1) % (unsigned int)item_hash_size;
    item_hash[hash] = (int)item_size;

    item_size++;
    return (int)(item_size - 1);
}

/*
 * Split one input line into str1[] and return the number of fields.
 * Fields are separated by spaces or tabs and the line ends at '\n', '\r' or
 * NUL. Over-long fields are truncated to MaxString - 1 bytes and fields past
 * MaxTokens are dropped, so str1 is never written out of bounds.
 * NOTE(review): the delimiter character was unreadable in the restored
 * source; space and tab are both accepted here - confirm the data format.
 */
static int SplitLine(const char *line)
{
    const char *p = line;
    int count = 0;

    for (;;) {
        while (*p == ' ' || *p == '\t')
            p++;
        if (*p == '\0' || *p == '\n' || *p == '\r')
            break;
        if (count == MaxTokens) {
            fprintf(stderr,
                    "Warning: more than %d fields on a line; the rest are ignored.\n",
                    MaxTokens);
            break;
        }

        size_t len = 0;
        while (*p != '\0' && *p != '\n' && *p != '\r' && *p != ' ' && *p != '\t') {
            if (len < (size_t)(MaxString - 1))
                str1[count][len++] = *p;
            p++;
        }
        str1[count][len] = '\0';
        count++;
    }
    return count;
}

/*
 * Convert a user-id field to a number. The first character is skipped
 * (presumably a letter prefix such as 'u' - confirm against the data) and the
 * decimal digits that follow are accumulated.
 */
static unsigned long long ParseUserId(const char *token)
{
    unsigned long long value = 0;

    if (*token == '\0')
        return 0;
    for (const char *p = token + 1; *p >= '0' && *p <= '9'; p++)
        value = value * 10 + (unsigned long long)(*p - '0');
    return value;
}

/* Record that userid bought itemid, creating the item on first sight. */
static void RecordPurchase(const char *itemid, unsigned long long userid)
{
    int index = SearchItem(itemid);

    if (index == -1) {
        AddItemidToItem(itemid, userid);
        return;
    }

    struct ItemInfo *entry = &item[index];

    if (entry->totaluser >= entry->max_user) {
        entry->max_user += UserGrowth;
        entry->userlist = GrowArray(entry->userlist, (size_t)entry->max_user,
                                    sizeof *entry->userlist);
    }
    entry->userlist[entry->totaluser++] = userid;
}

/* qsort comparator: ascending order of unsigned long long values. */
static int CompareUserIds(const void *lhs, const void *rhs)
{
    unsigned long long a = *(const unsigned long long *)lhs;
    unsigned long long b = *(const unsigned long long *)rhs;

    return (a > b) - (a < b);
}

/*
 * Load Data_1w.txt and build the per-item user lists.
 * Each user list is sorted after loading because binary_search() is used on
 * it later and the input is not guaranteed to list users in ascending order.
 * Terminates the program if the file cannot be opened or memory runs out.
 * Returns 0.
 */
int ReadItemInfo(void)
{
    FILE *fin = fopen("Data_1w.txt", "rb");
    if (fin == NULL)
        Die("The input file doesn't exist.");

    /* Init() normally allocates item[]; allocate here only if it has not. */
    if (item == NULL) {
        item = malloc((size_t)max_item * sizeof *item);
        if (item == NULL)
            Die("Item allocate failed.");
    }

    /* Read the users' purchase records, one line per user. */
    char *line = malloc(MaxUserSize);
    if (line == NULL)
        Die("Out of memory.");

    while (fgets(line, MaxUserSize, fin) != NULL) {
        int fields = SplitLine(line);
        if (fields == 0)
            continue; /* blank line */

        unsigned long long userid = ParseUserId(str1[0]);
        for (int i = 1; i < fields; i++)
            RecordPurchase(str1[i], userid);
    }

    free(line);
    fclose(fin);

    for (unsigned long long i = 0; i < item_size; i++)
        qsort(item[i].userlist, (size_t)item[i].totaluser,
              sizeof *item[i].userlist, CompareUserIds);

    return 0;
}

/* Allocate the hash table (all slots empty) and the initial item array. */
void Init(void)
{
    item_hash = malloc((size_t)item_hash_size * sizeof *item_hash);
    if (item_hash == NULL)
        Die("Item hash allocate failed.");
    for (int i = 0; i < item_hash_size; i++)
        item_hash[i] = -1;

    item = malloc((size_t)max_item * sizeof *item);
    if (item == NULL)
        Die("Item allocate failed.");
}

/*
 * Binary search for value in array[0..n-1], which must be sorted ascending.
 * Returns the index of a matching element, or -1 if there is none.
 */
int binary_search(const unsigned long long array[], int n, unsigned long long value)
{
    int left = 0;
    int right = n - 1;

    while (left <= right) {
        /* left + half the distance cannot overflow, unlike (left + right) / 2. */
        int middle = left + ((right - left) >> 1);

        if (array[middle] > value)
            right = middle - 1;
        else if (array[middle] < value)
            left = middle + 1;
        else
            return middle;
    }
    return -1;
}

/* Count the entries of a's user list that also occur in b's user list. */
static int CountCommonUsers(const struct ItemInfo *a, const struct ItemInfo *b)
{
    int common = 0;

    for (int t = 0; t < a->totaluser; t++) {
        if (binary_search(b->userlist, b->totaluser, a->userlist[t]) != -1)
            common++;
    }
    return common;
}

/*
 * Insert (sim, id) into the descending top-Similarity_item table, shifting
 * lower-ranked entries down by one; the last entry falls off. Nothing
 * happens when sim does not beat any current entry.
 */
static void InsertBest(float bestsim[], char bestid[][MaxString], float sim,
                       const char *id)
{
    for (int k = 0; k < Similarity_item; k++) {
        if (sim > bestsim[k]) {
            for (int q = Similarity_item - 1; q > k; q--) {
                bestsim[q] = bestsim[q - 1];
                memcpy(bestid[q], bestid[q - 1], MaxString);
            }
            bestsim[k] = sim;
            snprintf(bestid[k], MaxString, "%s", id);
            return;
        }
    }
}

/*
 * Thread entry point. The argument is the thread index (0-based) smuggled
 * through the void pointer. The thread handles a contiguous slice of item[]
 * (the last thread also takes the remainder) and writes one line per
 * neighbour to filename[index]:
 *     <item id> <neighbour id> <similarity>
 * where similarity = common / sqrt(|users(i)| * |users(j)|), considered only
 * when the two items share more than MinCommonUsers users.
 * NOTE(review): the field separator in the output format was unreadable in
 * the restored source; a single space is used here.
 */
void *CalItemSim(void *a)
{
    int tid = (int)(intptr_t)a;

    if (tid < 0 || tid >= num_threads || tid >= NumOutputFiles)
        return NULL;

    FILE *fout = fopen(filename[tid], "w");
    if (fout == NULL) {
        perror(filename[tid]);
        return NULL;
    }

    long long total = (long long)item_size;
    long long chunk = total / num_threads;
    long long left = chunk * tid;
    long long right = (tid == num_threads - 1) ? total - 1 : chunk * (tid + 1) - 1;

    float bestsim[Similarity_item];
    char bestitemid[Similarity_item][MaxString];

    /* Traverse the items assigned to this thread. */
    for (long long i = left; i <= right; i++) {
        for (int w = 0; w < Similarity_item; w++) {
            bestsim[w] = -1;
            bestitemid[w][0] = '\0';
        }

        for (long long j = 0; j < total; j++) {
            if (i == j)
                continue;

            int common = CountCommonUsers(&item[i], &item[j]);
            if (common <= MinCommonUsers)
                continue;

            /* Multiply in double: the int product can overflow. */
            double norm = sqrt((double)item[i].totaluser * (double)item[j].totaluser);
            float similarity = (float)(common / norm);

            InsertBest(bestsim, bestitemid, similarity, item[j].itemid);
        }

        /* Unused slots still hold the -1 sentinel set above. */
        for (int c = 0; c < Similarity_item && bestsim[c] >= 0; c++)
            fprintf(fout, "%s %s %f\n", item[i].itemid, bestitemid[c], bestsim[c]);
    }

    if (fclose(fout) != 0)
        perror(filename[tid]);
    return NULL;
}

/* Start num_threads workers running CalItemSim and wait for all of them. */
void CreatMulThread(void)
{
    if (num_threads < 1 || num_threads > NumOutputFiles)
        Die("num_threads must be between 1 and the number of output files.");

    pthread_t *pt = malloc((size_t)num_threads * sizeof *pt);
    if (pt == NULL)
        Die("Out of memory.");

    int started = 0;
    while (started < num_threads) {
        if (pthread_create(&pt[started], NULL, CalItemSim,
                           (void *)(intptr_t)started) != 0)
            break;
        started++;
    }

    for (int t = 0; t < started; t++)
        pthread_join(pt[t], NULL);
    free(pt);

    if (started < num_threads)
        Die("Failed to create a worker thread.");
}

/*
 * Combine the per-thread output files into Output.txt, deleting each
 * per-thread file once it has been copied.
 */
void FileJoin(void)
{
    FILE *fout = fopen("Output.txt", "w");
    if (fout == NULL) {
        perror("Output.txt");
        exit(1);
    }

    /* Exactly num_threads files are produced by CreatMulThread(). */
    for (int i = 0; i < num_threads && i < NumOutputFiles; i++) {
        FILE *fin = fopen(filename[i], "r");
        if (fin == NULL) {
            perror(filename[i]);
            continue;
        }

        while (fgets(str2, MaxLen, fin) != NULL)
            fputs(str2, fout);
        fclose(fin);

        if (remove(filename[i]) == 0)
            printf("Removed %s\n", filename[i]);
        else
            perror("remove");
    }

    if (fclose(fout) != 0)
        perror("Output.txt");
}

int main(void)
{
    Init();
    ReadItemInfo();
    CreatMulThread();
    FileJoin();
    return 0;
}
/* A multithreaded C implementation of item-based KNN (ITEM_KNN). */