To reverse an array, the array initialized on the host is transferred to the device, where a CUDA kernel performs the reversal in parallel. In this version the operation works entirely on global memory; the result is then copied back to the host for verification.
#include <stdio.h>
#include <assert.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// check for errors during CUDA execution
void checkCUDAError(const char *msg);

// Part 3: implement the kernel
/*
   blockDim   number of threads in a block
   blockIdx   block index within the grid
   gridDim    number of blocks in the grid
   threadIdx  thread index within the block
*/
__global__ void reverseArrayBlock(int *d_out, int *d_in)
{
    int inOffset  = blockDim.x * blockIdx.x;
    int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    int in  = inOffset + threadIdx.x;
    int out = outOffset + (blockDim.x - 1 - threadIdx.x);
    d_out[out] = d_in[in];
}

////////////////////////////////////////////////////////////////////
// main function
////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // pointer to the host's memory and its size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // pointers to the device memory
    int *d_b, *d_a;

    // define the grid and block size: number of threads per block
    int numThreadsPerBlock = 256;

    /*
       compute the number of blocks needed from the array size
       and the preset block size
    */
    int numBlocks = dimA / numThreadsPerBlock;

    // allocate storage on the host and on the device
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    // on the host
    h_a = (int *) malloc(memSize);
    // on the device
    cudaMalloc((void **) &d_a, memSize);
    cudaMalloc((void **) &d_b, memSize);

    // initialize the input array on the host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // copy the host array to the device, h_a --> d_a
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);

    // launch the kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<<dimGrid, dimBlock>>>(d_b, d_a);

    // block until the device has finished computing
    cudaThreadSynchronize();

    // check whether the device generated an error
    // check for any CUDA errors
    checkCUDAError("kernel invocation");

    // copy the result from the device to the host, d_b --> h_a
    cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);

    // check for any CUDA errors
    checkCUDAError("memcpy");

    // verify that the result returned to the host is correct
    for (int i = 0; i < dimA; i++)
    {
        assert(h_a[i] == dimA - 1 - i);
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    printf("Correct!\n");
    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
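To check the kernel's index arithmetic with concrete numbers: with numThreadsPerBlock = 256 and numBlocks = dimA / 256 = 1024, the thread with blockIdx.x = 0 and threadIdx.x = 0 reads in = 0 and writes out = 256 * 1023 + 255 = 262143 = dimA - 1, while the thread with blockIdx.x = 1023 and threadIdx.x = 255 reads dimA - 1 and writes 0, so d_out[out] = d_in[in] places every element at its mirrored position.

Because the input was initialized to h_a[i] = i, the assert above can simply compare against dimA - 1 - i. For an arbitrary input, a host-side reference reversal could be used for verification instead; the following is a minimal sketch (the helper reverseArrayHost is not part of the original program).

// Hypothetical CPU reference reversal, used only for checking the GPU result:
// writes h_in reversed into h_out, which can then be compared element by
// element with the array copied back from the device.
void reverseArrayHost(int *h_out, const int *h_in, int n)
{
    for (int i = 0; i < n; ++i)
    {
        h_out[i] = h_in[n - 1 - i];
    }
}

Assuming the source file is named reverseArray.cu, it can be compiled with nvcc, for example nvcc reverseArray.cu -o reverseArray; if all asserts pass, the program prints "Correct!".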
CUDA implementation of array reversal