Experiment ideas:
After the data is initialized on the host, it is copied to the device. The host and the device then each perform the same operation (adding 1 to every element), and the device's result is copied back to the host. Finally, the host-computed result and the device-computed result are compared to check that they are consistent.
// incrementArray.cu
//
// Initializes an array on the host, copies it to the device, adds 1 to every
// element on both the host and the device, and then checks that the two
// results are identical.

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

/*
 * Print a readable message and terminate the program if a CUDA runtime call
 * did not return cudaSuccess.
 *
 * status: value returned by the CUDA runtime call
 * what:   short description of the call, used in the error message
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host-side function: adds 1 to each of the first N elements of a.
 */
void incrementArrayOnHost(float *a, int N)
{
    for (int i = 0; i < N; i++) {
        a[i] = a[i] + 1.f;
    }
}

/*
 * Device-side function (kernel): each thread adds 1 to the element of a
 * selected by its global thread index.
 */
__global__ void incrementArrayOnDevice(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    /* The grid is rounded up to whole blocks, so trailing threads may fall
       past the end of the array and must do nothing. */
    if (idx < N) {
        a[idx] = a[idx] + 1.f;
    }
}

int main(void)
{
    float *a_h = NULL;   /* host array, incremented on the host */
    float *b_h = NULL;   /* host array that receives the device result */
    float *a_d = NULL;   /* device array */

    const int N = 10;
    const size_t size = N * sizeof(float);
    int mismatches = 0;

    /* Allocate host memory. */
    a_h = (float *)malloc(size);
    b_h = (float *)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)size);
        free(a_h);
        free(b_h);
        return EXIT_FAILURE;
    }

    /* Allocate device memory. */
    checkCuda(cudaMalloc((void **)&a_d, size), "cudaMalloc");

    /* Initialize the host data. */
    for (int i = 0; i < N; i++) {
        a_h[i] = (float)i;
    }

    /* Copy data from the host to the device: a_h --> a_d. */
    checkCuda(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    /* Compute on the host. */
    incrementArrayOnHost(a_h, N);

    /* Compute on the device.
       Part 1 of 2: compute the execution configuration. If N is a multiple
       of blockSize, N / blockSize blocks are enough; otherwise one extra
       block covers the remainder. */
    int blockSize = 4;
    int nBlocks = N / blockSize + (N % blockSize == 0 ? 0 : 1);

    /* Part 2 of 2: launch the kernel with nBlocks blocks of blockSize
       threads each. */
    incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N);
    checkCuda(cudaGetLastError(), "incrementArrayOnDevice launch");

    /* Copy the result from the device to the host and store it in b_h. */
    checkCuda(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    /* Check the result. An explicit comparison is used instead of assert()
       so the check still runs when NDEBUG is defined. */
    for (int i = 0; i < N; i++) {
        if (a_h[i] != b_h[i]) {
            fprintf(stderr, "mismatch at index %d: host %f, device %f\n",
                    i, a_h[i], b_h[i]);
            mismatches++;
        }
    }
    if (mismatches == 0) {
        printf("host and device results match for %d elements\n", N);
    }

    /* Release host memory and device memory. */
    free(a_h);
    free(b_h);
    checkCuda(cudaFree(a_d), "cudaFree");

    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
Test environment:
Win7 + vs2013 + cuda6.5
Download link
Call "incrementArray" on both the host and the device, and compare the results.