Introduction to Developing CUDA Programs on Ubuntu 16.04 (I) - CUDA

Source: Internet
Author: User
Tags: rand
Introduction to Developing CUDA Programs on Ubuntu 16.04 (I)

Environment: Ubuntu 16.04 + nvidia-smi 378.13 + CMake 3.5.1 + CUDA 8.0 + KDevelop 4.7.3

Environment configuration

For the NVIDIA driver, CMake, and CUDA configuration method, see: "Ubuntu 16.04 Configuration to Run Kintinuous". KDevelop configuration: at the command line, enter sudo apt-get install kdevelop (the package name appears to have been dropped from the original text).

Reference documents

[1] Liu Jinxian et al. Parallel Programming Based on CUDA. Science Press, 2014.
[2] Using CMake to compile CUDA on Linux: http://blog.csdn.net/u012839187/article/details/45887737
[3] CUDA example: /home/luhaiyan/NVIDIA_CUDA-8.0_Samples/0_Simple/vectorAdd/vectorAdd.cu

Array addition - program code

Open KDevelop and choose "Project" -> "New from Template..." -> "Standard" -> "Terminal". Fill in "Application Name:" with "cuda_test" and leave "Location:" at its default, "/home/luhaiyan/projects". In the cuda_test project, create a new file "test_cuda_fun.cu". The content of "test_cuda_fun.cu" is [2][3]:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

// ---------------------------------------------------------------------------
// Device-side code
// ---------------------------------------------------------------------------

/**
 * Element-wise vector addition: c[i] = a[i] + b[i].
 *
 * Each thread handles the single element whose index equals its flat global
 * thread index (blockDim.x * blockIdx.x + threadIdx.x). Threads whose index
 * is >= numElements do nothing, so the grid may be larger than the data.
 *
 * a, b         input vectors (device pointers), numElements floats each
 * c            output vector (device pointer), numElements floats
 * numElements  number of elements to add
 */
__global__ void vectorAdd(const float *a, const float *b, float *c, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        c[i] = a[i] + b[i];
    }
}

// ---------------------------------------------------------------------------
// Host-side code
// ---------------------------------------------------------------------------

/**
 * Abort the program if a CUDA runtime call failed.
 *
 * Prints "<what> (error code <cuda error string>)!" to stderr and exits with
 * EXIT_FAILURE when err is not cudaSuccess; otherwise returns normally.
 */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Run a 3-element vector addition on the GPU and verify it on the host.
 *
 * Allocates host and device buffers, fills the inputs with rand() values in
 * [0, 1], copies them to the device, launches vectorAdd, copies the result
 * back, compares it against the host-side sum, prints the result and frees
 * all buffers.
 *
 * Returns 0 on success. Any allocation, CUDA, or verification failure prints
 * a message to stderr and terminates the process with EXIT_FAILURE.
 *
 * Note the extern "C" form of the definition: it gives the function C linkage
 * so that it can be declared and called from a separately compiled .cpp file.
 */
extern "C" int func()
{
    // Print the vector length to be used, and compute its size in bytes.
    int numElements = 3;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vectors A and B and the host output vector C.
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);

    // Verify that the allocations succeeded.
    if (h_a == NULL || h_b == NULL || h_c == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors and show their contents.
    printf("Index h_a h_b\n");
    for (int i = 0; i < numElements; ++i)
    {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
        printf("Index %d: %f %f\n", i, h_a[i], h_b[i]);
    }
    printf("\n");

    // Allocate the device vectors (one-dimensional linear storage).
    float *d_a = NULL;
    checkCuda(cudaMalloc((void **)&d_a, size), "Failed to allocate device vector A");

    float *d_b = NULL;
    checkCuda(cudaMalloc((void **)&d_b, size), "Failed to allocate device vector B");

    float *d_c = NULL;
    checkCuda(cudaMalloc((void **)&d_c, size), "Failed to allocate device vector C");

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory.
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice),
              "Failed to copy vector A from host to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice),
              "Failed to copy vector B from host to device");

    // Launch the vector-add CUDA kernel. The block count is rounded up so
    // that every element is covered; the kernel bounds-checks the tail.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);

    // A kernel launch returns no status itself; query it explicitly.
    checkCuda(cudaGetLastError(), "Failed to launch vectorAdd kernel");

    // Copy the device result vector in device memory to the host result
    // vector in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost),
              "Failed to copy vector C from device to host");

    // Verify that the result vector is correct.
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_a[i] + h_b[i] - h_c[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n\n");

    printf("vectorAdd_Result:\n");
    for (int i = 0; i < numElements; i++)
    {
        printf("Index %d: %f\n", i, h_c[i]);
    }
    printf("\n");

    // Free device global memory.
    checkCuda(cudaFree(d_a), "Failed to free device vector A");
    checkCuda(cudaFree(d_b), "Failed to free device vector B");
    checkCuda(cudaFree(d_c), "Failed to free device vector C");

    // Free host memory.
    free(h_a);
    free(h_b);
    free(h_c);

    printf("Done\n");
    return 0;
}
The contents of the "main.cpp" file are:
#include <iostream>
using namespace std;

// Notice the declaration here: func() is defined in test_cuda_fun.cu as
// extern "C", so it must be declared with C linkage here as well.
extern "C" int func();

// Program entry point: runs the CUDA vector-addition test.
int main()
{
    func();
    return 0;
}
The contents of the "CMakeLists.txt" file are:
cmake_minimum_required(VERSION 2.6)
project(cuda_test)

# Locate the CUDA toolkit. The package name and the REQUIRED keyword are
# case-sensitive; configuration stops with an error if CUDA is not found.
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})

# Build the test_cuda executable from the C++ entry point and the CUDA source.
# cuda_add_executable is provided by the CUDA package found above.
cuda_add_executable(test_cuda main.cpp test_cuda_fun.cu)

Right-click the "cuda_test" project and click "Build".

Overall project results after build

Command Line Input

cd '/home/luhaiyan/projects/cuda_test/build'
./test_cuda
Run Result:

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.