Today I made some progress: the array summation code runs successfully. The task is simply to add up n numbers.
Environment: CUDA 5.0, VS2010
#include "cuda_runtime.h"
#include "Device_launch_parameters.h"
#include <stdio.h>
cudaerror_t Addwithcuda (int *c, int *a);
#define TOTALN 72120
#define Blocks_pergrid 32
#define THREADS_PERBLOCK 64//2^8
__global__ void sumArray(int *c, int *a)//, int *b)
{
    __shared__ unsigned int mycache[THREADS_PERBLOCK]; // shared memory within each block; THREADS_PERBLOCK == blockDim.x
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = gridDim.x * blockDim.x; // total number of threads in the grid
    int cachen;
    unsigned int sum, k;
    sum = 0;
    cachen = threadIdx.x;
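    // Grid-stride loop below: with BLOCKS_PERGRID = 32 and THREADS_PERBLOCK = 64 there are
    // 32 * 64 = 2048 threads in total, so each thread accumulates about 72120 / 2048 ≈ 35
    // elements: thread i handles a[i], a[i + 2048], a[i + 4096], and so on.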
    while (i < TOTALN)
    {
        sum += a[i]; //+ b[i];
        i = i + j;
    }
    mycache[cachen] = sum;
    __syncthreads(); // synchronize the block: wait until every thread has finished its partial sum

    // Now combine the per-thread sums stored in mycache within this block.
    // Recursive halving (see "CUDA by Example", Chinese edition 《GPU高性能编程CUDA实战》):
    // Step 1: fold the upper half of the array onto the lower half:
    k = THREADS_PERBLOCK >> 1;
    while (k)
    {
        if (cachen < k)
        {
            // Only threads with an index below the halfway point keep adding here:
            // each adds in its partner from the upper half, so the results collect
            // in the first half of the array, ready for the next pass.
            mycache[cachen] += mycache[cachen + k];
        }
        __syncthreads(); // synchronize the block: wait until every thread has finished this pass
        k = k >> 1; // halve the active range again, preparing the next pass
    }
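    // Example with 8 threads: the pass with k = 4 does mycache[0..3] += mycache[4..7],
    // k = 2 does mycache[0..1] += mycache[2..3], and k = 1 does mycache[0] += mycache[1];
    // after log2(8) = 3 passes, mycache[0] holds the sum of all 8 partial sums.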
    // The final pass ends in thread 0 of the block, which writes the block's total
    // to c[blockIdx.x] so it can be returned to the CPU.
    if (cachen == 0)
    {
        c[blockIdx.x] = mycache[0];
    }
}

int main()
{
    int a[TOTALN];
    int c[BLOCKS_PERGRID];
    unsigned int j;
    for (j = 0; j < TOTALN; j++)
    {
        // Initialize the array; fill it with any data you like, I use 1
        a[j] = 1;
    }
    // Sum in parallel on the GPU
    cudaError_t cudaStatus = addWithCuda(c, a);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    unsigned int sum1, sum2;
    sum1 = 0;
    for (j = 0; j < BLOCKS_PERGRID; j++)
    {
        sum1 += c[j];
    }

    // Verify on the CPU
    sum2 = 0;
    for (j = 0; j < TOTALN; j++)
    {
        sum2 += a[j];
    }
    printf("sum1=%d; sum2=%d\n", sum1, sum2);
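    // (With every element initialized to 1, both sums should be 72120.)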
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}

// Helper function that uses CUDA to sum the array in parallel.
cudaError_t addWithCuda(int *c, int *a)
{
    int *dev_a = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU memory with the same length as the c array in main
    cudaStatus = cudaMalloc((void**)&dev_c, BLOCKS_PERGRID * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Allocate GPU memory with the same length as the a array in main
    cudaStatus = cudaMalloc((void**)&dev_a, TOTALN * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
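    // dev_c receives one partial sum per block (BLOCKS_PERGRID ints);
    // dev_a holds the whole input array (TOTALN ints).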
    //////////////////////////////////////////////////
    // Copy the input vector from host memory to the GPU buffer
    // (copy a's data from the CPU to the GPU).
    cudaStatus = cudaMemcpy(dev_a, a, TOTALN * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    //////////////////////////////////////////////////
    // Launch the kernel on the GPU: BLOCKS_PERGRID blocks of THREADS_PERBLOCK threads each,
    // with a grid-stride loop so the threads cover every element.
    sumArray<<<BLOCKS_PERGRID, THREADS_PERBLOCK>>>(dev_c, dev_a);//, dev_b);

    // cudaDeviceSynchronize waits for the kernel to finish (i.e. for all threads to end)
    // and returns any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching sumArray!\n", cudaStatus);
        goto Error;
    }
    // Copy the per-block results from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, BLOCKS_PERGRID * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);

    return cudaStatus;
}