How to perform a parallel calculation across the blocks of a grid after loading data onto the GPU (a grid contains more than one block).
/****how Do we run code in parallel on the device****/ /****use block****/_global_voidAddint*a,int*b,int*c) {c[blockidx.x]= a[blockidx.x] +b[blockidx.x];}#defineN 512intMain () {int*a, *b, *c;//host copies of A, B, C int*d_a, *d_b, *d_c;//device copies of A, B, C intSize = N *sizeof(int); //Alloc space for device copies of A, B, CCudamalloc ((void* *) &d_a, size); Cudamalloc ((void* *) &d_b, size); Cudamalloc ((void* *) &D_c, size); //Alloc Space for host copies of A, B, c and setup input valuesA = (int*) malloc (size); Random_ints (A, N); b= (int*) malloc (size); Random_ints (b, N); C= (int*) malloc (size); //Copy the data into devicecudememcpy (D_a, A, size, cudamemcpyhosttodevice); cudamemcpy (d_b, B, size, cudamemcpyhosttodevice); //Launch Add () kernel on GPU with N blocksAdd<<<n,1>>>(d_a, D_b, D_c); //Copy result back to hostcudamemcpy (c, D_c, size, cudamemcpydevicetohost); //CleanupFree ( a), free (b), free (c); Cudefree (d_a); Cudafree (d_b); Cudafree (D_c); return 0;}/** * * what ' s the function of random_ints****/voidRandom_ints (intAintN) { inti; for(i =0; i < N; ++i) a[i]=rand ();}
CUDA Programming Learning (Part II)