Introduction to Developing CUDA Programs on Ubuntu 16.04 (Part 1). Environment: Ubuntu 16.04 + NVIDIA driver 378.13 (as reported by nvidia-smi) + CMake 3.5.1 + CUDA 8.0 + KDevelop 4.7.3
Environment configuration: for how to set up the NVIDIA driver, CMake, and CUDA, see the article "Configuring Ubuntu 16.04 to run Kintinuous". KDevelop configuration: on the command line, enter sudo apt-get install kdevelop
Reference documents: [1] Liu Jinxian et al. Parallel Programming Based on CUDA. Science Press, 2014. [2] Using CMake to compile CUDA on Linux: http://blog.csdn.net/u012839187/article/details/45887737 [3] CUDA sample: /home/luhaiyan/NVIDIA_CUDA-8.0_Samples/0_Simple/vectorAdd/vectorAdd.cu
Array addition: program code. Open KDevelop and choose "Project" -> "New from Template..." -> "Standard" -> "Terminal". Set "Application Name:" to "cuda_test" and leave "Location:" at its default, "/home/luhaiyan/projects". In the cuda_test project, create a new file "test_cuda_fun.cu" with the following content [2][3]:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

/**
 * Device-side code: element-wise vector addition, C[i] = A[i] + B[i].
 *
 * Expects a 1D grid of 1D blocks with one thread per element. Threads whose
 * global index falls past the end of the vectors do nothing, so the grid may
 * contain more threads than there are elements.
 *
 * @param A           first input vector (device pointer)
 * @param B           second input vector (device pointer)
 * @param C           output vector (device pointer)
 * @param numElements number of elements in each vector
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the tail: the grid size is rounded up to a whole number of blocks.
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Host-side code: adds two small random vectors on the GPU and prints the
 * inputs and the result.
 *
 * Declared extern "C" so that main.cpp can declare and call it under the
 * unmangled name `func`.
 *
 * Any failed allocation, CUDA call, or result check prints a message to
 * stderr and terminates the process with EXIT_FAILURE.
 *
 * @return 0 on success
 */
extern "C" int func()
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 3;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    printf("Index h_A h_B\n");

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        printf("Index %d: %f %f\n", i, h_A[i], h_B[i]);
    }

    printf("\n");

    // Allocate the device input vector A (one-dimensional linear storage)
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");

    // Transfer one-dimensional linear memory from the host side to the device side
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the vector add CUDA kernel; the block count is rounded up so
    // that every element is covered by a thread
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    // A kernel launch returns no status itself; query it explicitly
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n\n");

    printf("vectorAdd_result:\n");

    for (int i = 0; i < numElements; i++)
    {
        printf("Index %d: %f\n", i, h_C[i]);
    }

    printf("\n");

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}
The contents of the "main.cpp" file are:
#include <iostream>
using namespace std;

// func() is defined in test_cuda_fun.cu with C linkage, so it must be
// declared extern "C" here as well for the linker to resolve it.
extern "C" int func();  // Notice the declaration here

// Program entry point: runs the CUDA vector-addition demo.
int main()
{
    func();
    return 0;
}
The contents of the "CMakeLists.txt" file are:
# Build script for the cuda_test project: compiles main.cpp together with the
# CUDA source test_cuda_fun.cu into the executable "test_cuda".
cmake_minimum_required(VERSION 2.6)
project(cuda_test)

# Package name and the REQUIRED keyword are case-sensitive.
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})

# cuda_add_executable is provided by the CUDA package found above.
cuda_add_executable(test_cuda main.cpp test_cuda_fun.cu)
Right-click the "cuda_test" project and click "Build".
The overall project layout after the build:
Then, on the command line, enter:
cd '/home/luhaiyan/projects/cuda_test/build'
./test_cuda
Run Result: