Introduction to Developing CUDA Programs on Ubuntu 16.04 (I)

Introduction to Developing CUDA Programs on Ubuntu 16.04 (I) - CUDA

Last Update:2018-08-20 Source: Internet

Author: User

Tags: rand

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more >

Introduction to Developing CUDA Programs on Ubuntu 16.04 (I). Environment: Ubuntu 16.04 + nvidia-smi 378.13 + CMake 3.5.1 + CUDA 8.0 + KDevelop 4.7.3. Environment configuration: for the NVIDIA driver, CMake, and CUDA configuration method, see "Ubuntu 16.04 Configuration to Run Kintinuous". KDevelop configuration: at the command line, enter "sudo apt-get install kdevelop". Reference documents: [1] Liu Jinxian et al. Parallel Programming Based on CUDA. Science Press, 2014. [2] Enabling CMake to compile CUDA on Linux: http://blog.csdn.net/u012839187/article/details/45887737. [3] CUDA example: /home/luhaiyan/NVIDIA_CUDA-8.0_Samples/0_Simple/vectorAdd/vectorAdd.cu. Array addition - program code: open KDevelop and create a new project via "New from Template..." - "Standard" - "Terminal"; fill in "cuda_test" for "Application Name:" and keep the default "Location:", "/home/luhaiyan/projects". Under the cuda_test project, create a new file "test_cuda_fun.cu". The content of the "test_cuda_fun.cu" file is [2][3]:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// ---------------------------------------------------------------------------
// Device-side code
// ---------------------------------------------------------------------------

// Element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < numElements.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// contain more threads than elements; the bounds check below makes the extra
// threads do nothing. No shared memory is used.
__global__ void vectorAdd(const float *a, const float *b, float *c, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        c[i] = a[i] + b[i];
    }
}

// ---------------------------------------------------------------------------
// Host-side code
// ---------------------------------------------------------------------------

// Terminates the program with a diagnostic when a CUDA runtime call failed.
// `action` completes the sentence "Failed to <action>".
static void checkCuda(cudaError_t err, const char *action)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Adds two small random vectors on the GPU, verifies the result on the host
// and prints inputs and outputs.
//
// Declared extern "C" so that it can be called from main.cpp through a plain
// `extern "C" int func();` declaration without C++ name mangling.
//
// Returns 0 on success. Any allocation, copy, launch or verification failure
// prints a message to stderr and terminates the process with EXIT_FAILURE.
extern "C" int func()
{
    // Print the vector length to be used, and compute its size in bytes.
    int numElements = 3;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vectors A and B and the host output vector C.
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);

    // Verify that the host allocations succeeded.
    if (h_a == NULL || h_b == NULL || h_c == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors with values in [0, 1] and echo them.
    printf("Index h_a h_b\n");
    for (int i = 0; i < numElements; ++i)
    {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
        printf("Index %d: %f %f\n", i, h_a[i], h_b[i]);
    }
    printf("\n");

    // Allocate the device vectors (one-dimensional linear device memory).
    float *d_a = NULL;
    checkCuda(cudaMalloc((void **)&d_a, size), "allocate device vector A");

    float *d_b = NULL;
    checkCuda(cudaMalloc((void **)&d_b, size), "allocate device vector B");

    float *d_c = NULL;
    checkCuda(cudaMalloc((void **)&d_c, size), "allocate device vector C");

    // Copy the host input vectors A and B to the device input vectors.
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice),
              "copy vector A from host to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice),
              "copy vector B from host to device");

    // Launch the vector-add kernel. The block count is rounded up so that
    // every element is covered even when numElements is not a multiple of
    // the block size.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);

    // A kernel launch returns no status itself; launch errors are reported
    // through cudaGetLastError().
    checkCuda(cudaGetLastError(), "launch vectorAdd kernel");

    // Copy the device result vector back to the host result vector.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost),
              "copy vector C from device to host");

    // Verify that the result vector is correct (tolerance-based comparison,
    // since bit-exact float equality is not expected).
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_a[i] + h_b[i] - h_c[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test passed\n\n");

    printf("vectoradd_result:\n");
    for (int i = 0; i < numElements; i++)
    {
        printf("Index %d: %f\n", i, h_c[i]);
    }
    printf("\n");

    // Free device global memory.
    checkCuda(cudaFree(d_a), "free device vector A");
    checkCuda(cudaFree(d_b), "free device vector B");
    checkCuda(cudaFree(d_c), "free device vector C");

    // Free host memory.
    free(h_a);
    free(h_b);
    free(h_c);

    printf("done\n");
    return 0;
}

The contents of the "main.cpp" file are:

#include <iostream>
using namespace std;

// func() is defined in test_cuda_fun.cu with C linkage. The declaration here
// must also be extern "C" so the linker looks for the unmangled symbol.
extern "C" int func();

// Program entry point: runs the CUDA vector-addition demo and exits with 0.
int main()
{
    func();
    return 0;
}

The contents of the "CMakeLists.txt" file are:

# Build script for the cuda_test demo: compiles main.cpp together with the
# CUDA source test_cuda_fun.cu into a single executable named test_cuda.
cmake_minimum_required(VERSION 2.6)
project(cuda_test)

# Package name and keyword are case-sensitive: CUDA / REQUIRED.
find_package(CUDA REQUIRED)
# Variable names are case-sensitive; FindCUDA sets CUDA_INCLUDE_DIRS.
include_directories(${CUDA_INCLUDE_DIRS})

# cuda_add_executable (provided by FindCUDA) routes .cu files through nvcc.
# The target name matches the ./test_cuda binary run after the build.
cuda_add_executable(test_cuda main.cpp test_cuda_fun.cu)

Right-click the "Cuda_test" project and click "Build"

Overall project results after build

Command Line Input

cd '/home/luhaiyan/projects/cuda_test/build'
./test_cuda

Run Result:

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More