Cuda Learning: First CUDA code: Array summation

Source: Internet
Author: User
Tags: goto

Today we made some progress: we successfully ran the array-summation code, which adds up all n elements of an array.

Environment: cuda5.0,vs2010

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Host-side helper: sums a[0..TOTALN) on the GPU, leaving one partial
// sum per block in c[0..Blocks_pergrid). Defined below main().
cudaError_t Addwithcuda(int *c, int *a);


#define TOTALN 72120            // number of array elements to sum
#define Blocks_pergrid 32       // grid size (number of blocks)
#define THREADS_PERBLOCK 64     // block size; 2^6 — must stay a power of two
                                // for the halving reduction in the kernel

// Kernel: reduces a[0..TOTALN) to per-block partial sums.
// Expected launch: <<<Blocks_pergrid, THREADS_PERBLOCK>>>.
// c must hold gridDim.x ints; uses THREADS_PERBLOCK unsigned ints of
// static shared memory. THREADS_PERBLOCK must be a power of two.
__global__ void Sumarray(int *c, int *a)
{
    // Per-block scratchpad: one partial-sum slot per thread
    // (THREADS_PERBLOCK == blockDim.x for the expected launch).
    __shared__ unsigned int mycache[THREADS_PERBLOCK];

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = gridDim.x * blockDim.x;  // total number of threads in the grid
    int cacheIdx = threadIdx.x;           // this thread's slot in mycache
    unsigned int sum = 0;
    unsigned int k;

    // Grid-stride loop: each thread accumulates its strided slice of a.
    while (i < TOTALN)
    {
        sum += a[i];
        i += stride;
    }

    mycache[cacheIdx] = sum;

    // Wait until every thread in the block has published its partial sum.
    __syncthreads();

    // Tree reduction in shared memory (see "CUDA by Example"): each pass
    // folds the upper half of the live range onto the lower half, so the
    // next pass works on half as many slots.
    k = THREADS_PERBLOCK >> 1;
    while (k)
    {
        if (cacheIdx < k)
        {
            // Only the lower half of the threads do the add this round.
            mycache[cacheIdx] += mycache[cacheIdx + k];
        }
        // Barrier sits OUTSIDE the divergent branch so every thread in the
        // block reaches it; then halve the range for the next round.
        __syncthreads();
        k >>= 1;
    }

    // Slot 0 now holds the block total; thread 0 writes it out so the
    // host can finish the reduction across blocks.
    if (cacheIdx == 0)
    {
        c[blockIdx.x] = mycache[0];
    }
}

// Host driver: fills the input with ones, sums it on the GPU (per-block
// partials via Addwithcuda), finishes the reduction on the CPU, and
// verifies against a pure-CPU sum. Returns 0 on success, 1 on CUDA error.
int main()
{
    // static: TOTALN ints (~282 KB) could overflow a default thread stack.
    static int a[TOTALN];
    int c[Blocks_pergrid];

    unsigned int j;
    // Initialize with known data (all ones) so the expected sum is TOTALN.
    for (j = 0; j < TOTALN; j++)
    {
        a[j] = 1;
    }

    // Sum in parallel on the GPU; c receives one partial sum per block.
    cudaError_t cudaStatus = Addwithcuda(c, a);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Addwithcuda failed!");
        return 1;
    }

    // Finish the reduction on the CPU: add up the per-block partials.
    unsigned int sum1 = 0;
    for (j = 0; j < Blocks_pergrid; j++)
    {
        sum1 += c[j];
    }

    // CPU reference sum for verification; sum1 should equal sum2.
    unsigned int sum2 = 0;
    for (j = 0; j < TOTALN; j++)
    {
        sum2 += a[j];
    }

    printf("sum1=%u; sum2=%u\n", sum1, sum2);

    // cudaDeviceReset must be called before exiting so profiling/tracing
    // tools such as Nsight and Visual Profiler show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function that sums a[0..TOTALN) on the GPU in parallel.
//   c: out — receives Blocks_pergrid per-block partial sums (host memory).
//   a: in  — TOTALN ints to sum (host memory).
// Returns the first CUDA error encountered, or cudaSuccess.
// All device buffers are released on every exit path via the Error label.
cudaError_t Addwithcuda(int *c, int *a)
{
    int *dev_a = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Device buffer for the per-block results (same length as c in main).
    cudaStatus = cudaMalloc((void**)&dev_c, Blocks_pergrid * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Device buffer for the input array (same length as a in main).
    cudaStatus = cudaMalloc((void**)&dev_a, TOTALN * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input vector from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_a, a, TOTALN * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the reduction kernel: one partial sum per block.
    Sumarray<<<Blocks_pergrid, THREADS_PERBLOCK>>>(dev_c, dev_a);

    // A kernel launch returns no status directly; check for launch-
    // configuration errors explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Sumarray launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Sumarray!\n", cudaStatus);
        goto Error;
    }

    // Copy the per-block partial sums from the GPU back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, Blocks_pergrid * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Free device buffers on every path (cudaFree(0) is a safe no-op).
    cudaFree(dev_c);
    cudaFree(dev_a);

    return cudaStatus;
}

Cuda Learning: First CUDA code: Array summation

Contact Us

The content on this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If any content on this page is confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.