Project Package Download
When the number of vector elements exceeds the number of threads
Number of vector elements / (blockDim.x * gridDim.x) = 2.x, so each thread has to process more than one element
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Refer to the applicable NVIDIA End User License Agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */

#include <stdio.h>
#include <stdlib.h>

#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

/* Number of elements in each vector. */
#define N (33 * 1024)

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < N.
 *
 * a, b, c must be device pointers to at least N ints each.
 * Each thread starts at its global index and then advances by the total
 * number of launched threads (blockDim.x * gridDim.x), so every element
 * is covered even when N is larger than the number of threads launched.
 * Expects a 1D grid of 1D blocks; uses no shared memory.
 */
__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

/*
 * Fills two host vectors, adds them on the GPU, and verifies the result
 * against a + b computed on the CPU. Prints "We did it!" on success and
 * one "Error" line per mismatching element otherwise.
 */
int main(void) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the CPU
    a = (int*)malloc(N * sizeof(int));
    b = (int*)malloc(N * sizeof(int));
    c = (int*)malloc(N * sizeof(int));
    if (a == NULL || b == NULL || c == NULL) {
        // free(NULL) is a no-op, so releasing all three is safe here
        fprintf(stderr, "Host memory allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c, N * sizeof(int)));

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(int),
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(int),
                            cudaMemcpyHostToDevice));

    /*
     * The number of vector elements exceeds the number of threads:
     * N / (blocks * threads per block) = 33792 / 16384 = 2.x,
     * so each thread handles more than one element via the loop in add().
     * NOTE(review): the launch configuration was unreadable in the original
     * text; 128 x 128 is assumed because it matches the "2.x" ratio above.
     * The kernel's loop is correct for any 1D configuration.
     */
    add<<<128, 128>>>(dev_a, dev_b, dev_c);
    // a kernel launch returns no status; surface bad-configuration errors here
    HANDLE_ERROR(cudaGetLastError());

    // copy the array 'c' back from the GPU to the CPU
    // (blocking copy, so the kernel has finished before 'c' is read)
    HANDLE_ERROR(cudaMemcpy(c, dev_c, N * sizeof(int),
                            cudaMemcpyDeviceToHost));

    // verify that the GPU did the work we requested
    bool success = true;
    for (int i = 0; i < N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf("Error:  %d + %d != %d\n", a[i], b[i], c[i]);
            success = false;
        }
    }
    if (success) printf("We did it!\n");

    // free the memory we allocated on the GPU
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));

    // free the memory we allocated on the CPU
    free(a);
    free(b);
    free(c);

    return 0;
}
Handling the case where the number of elements exceeds the number of threads in CUDA