We have reused the same version of the code many times now, for which I am a little sorry, so this time I want to make a larger change. Keep your eyes peeled, and let's wait and see.
Block parallelism is analogous to running multiple processes in an operating system. The previous section introduced the CUDA concept of a thread group (thread block), which organizes a set of threads together, allocates them a share of the resources, and then schedules their execution internally. One thread block has no relationship with another thread block, which makes blocks well suited to more coarse-grained parallelism. We will now change the code from the previous section into the block-parallel version, as follows:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);

// Element-wise vector add: c[i] = a[i] + b[i], where i is the block index.
//
// Only blockIdx.x selects the element; threadIdx is not consulted, so every
// thread of block i works on element i. There is no bounds check, so the
// launch must not use more blocks (in x) than a, b and c have elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // Highlighted in the original article as the changed line: the element
    // index is taken from the block index.
    int i = blockIdx.x;
    c[i] = a[i] + b[i];
}

// Adds two fixed 5-element vectors through addWithCuda and prints the sums.
// Returns 0 on success and 1 when a required CUDA call reports an error.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    cudaError_t cudaStatus;

    // Enumerate the CUDA devices and read each one's properties.
    int num = 0;
    cudaDeviceProp prop;
    cudaStatus = cudaGetDeviceCount(&num);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
                cudaGetErrorString(cudaStatus));
        return 1;
    }
    for (int i = 0; i < num; i++) {
        // prop is filled in but not used afterwards, so a failed query is
        // reported and the program carries on.
        cudaStatus = cudaGetDeviceProperties(&prop, i);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    i, cudaGetErrorString(cudaStatus));
        }
    }

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // It replaces the deprecated cudaThreadExit.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
    int *dev_a = 0;
    int *dev_b = 0;