Today I made some progress: the array summation code runs successfully. The task is simply to add up n numbers.
Environment: CUDA 5.0, VS2010
#include "cuda_runtime.h"
#include "Device_launch_parameters.h"
#include <stdio.h>
cudaerror_t Addwithcuda (int *c, int *a);
#define TOTALN 72120
#define Blocks_pergrid 32
#define THREADS_PERBLOCK 64//2^8
__global__ void sumArray(int *c, int *a)//, int *b)
{
    __shared__ unsigned int mycache[THREADS_PERBLOCK]; // shared memory within each block; THREADS_PERBLOCK == blockDim.x
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = gridDim.x * blockDim.x; // total number of threads in the grid
    int cachen;
    unsigned int sum, k;
    sum = 0;
    cachen = threadIdx.x;
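    // Grid-stride loop below: with BLOCKS_PERGRID = 32 and THREADS_PERBLOCK = 64 there are
    // 32 * 64 = 2048 threads in total, so each thread accumulates about 72120 / 2048 ≈ 35
    // elements: thread i handles a[i], a[i + 2048], a[i + 4096], and so on.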
    while (i < TOTALN)
    {
        sum += a[i]; //+ b[i];
        i = i + j;
    }
    mycache[cachen] = sum;
    __syncthreads(); // synchronize the block: wait until every thread has finished its partial sum

    // Now combine the per-thread sums stored in mycache within this block.
    // Recursive halving (see "CUDA by Example", Chinese edition 《GPU高性能编程CUDA实战》):
    // Step 1: fold the upper half of the array onto the lower half:
    k = THREADS_PERBLOCK >> 1;
    while (k)
    {
        if (cachen < k)
        {
            // Only threads with an index below the halfway point keep adding here:
            // each adds in its partner from the upper half, so the results collect
            // in the first half of the array, ready for the next pass.
            mycache[cachen] += mycache[cachen + k];
        }
        __syncthreads(); // synchronize the block: wait until every thread has finished this pass
        k = k >> 1; // halve the active range again, preparing the next pass
    }
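    // Example with 8 threads: the pass with k = 4 does mycache[0..3] += mycache[4..7],
    // k = 2 does mycache[0..1] += mycache[2..3], and k = 1 does mycache[0] += mycache[1];
    // after log2(8) = 3 passes, mycache[0] holds the sum of all 8 partial sums.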
    // The final pass ends in thread 0 of the block, which writes the block's total
    // to c[blockIdx.x] so it can be returned to the CPU.
    if (cachen == 0)
    {
        c[blockIdx.x] = mycache[0];
    }
}

int main()
{
    int a[TOTALN];
    int c[BLOCKS_PERGRID];
    unsigned int j;
    for (j = 0; j < TOTALN; j++)
    {
        // Initialize the array; fill it with any data you like, I use 1
        a[j] = 1;
    }
    // Sum in parallel on the GPU
    cudaError_t cudaStatus = addWithCuda(c, a);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    unsigned int sum1, sum2;
    sum1 = 0;
    for (j = 0; j < BLOCKS_PERGRID; j++)
    {
        sum1 += c[j];
    }

    // Verify on the CPU
    sum2 = 0;
    for (j = 0; j < TOTALN; j++)
    {
        sum2 += a[j];
    }
    printf("sum1=%d; sum2=%d\n", sum1, sum2);
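    // (With every element initialized to 1, both sums should be 72120.)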
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}

// Helper function that uses CUDA to sum the array in parallel.
cudaError_t addWithCuda(int *c, int *a)
{
    int *dev_a = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU memory with the same length as the c array in main
    cudaStatus = cudaMalloc((void**)&dev_c, BLOCKS_PERGRID * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Allocate GPU memory with the same length as the a array in main
    cudaStatus = cudaMalloc((void**)&dev_a, TOTALN * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
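    // dev_c receives one partial sum per block (BLOCKS_PERGRID ints);
    // dev_a holds the whole input array (TOTALN ints).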
    //////////////////////////////////////////////////
    // Copy the input vector from host memory to the GPU buffer
    // (copy a's data from the CPU to the GPU).
    cudaStatus = cudaMemcpy(dev_a, a, TOTALN * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    //////////////////////////////////////////////////
    // Launch the kernel on the GPU: BLOCKS_PERGRID blocks of THREADS_PERBLOCK threads each,
    // with a grid-stride loop so the threads cover every element.
    sumArray<<<BLOCKS_PERGRID, THREADS_PERBLOCK>>>(dev_c, dev_a);//, dev_b);

    // cudaDeviceSynchronize waits for the kernel to finish (i.e. for all threads to end)
    // and returns any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching sumArray!\n", cudaStatus);
        goto Error;
    }
    // Copy the per-block results from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, BLOCKS_PERGRID * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);

    return cudaStatus;
}