Cuda Learning Note Two

Source: Internet
Author: User

A simple vector addition example


/** * Vector addition:c = a + B. * * This sample was A very basic sample that implements element by element * Vector Addit Ion. It is the same as the sample illustrating Chapter 2 * of the Programming Guide with some additions like error checking. */#include <stdio.h>//for the CUDA runtime routines (prefixed with "Cuda_") #include <cuda_runtime.h>/** * CUD A Kernel Device code * * Computes the vector addition of a and B into C. The 3 vectors has the same * number of elements numelements. */__global__ voidvectoradd (const float *a, const float *b, float *c, int numelements) {int i = blockdim.x * blockidx.x + threadidx.x;//COMPUTE thread index//printf ("Thread index:%d\n", i),//cuda 2.0 above support kernel function printing, of course, when the number of threads is much easier to cause some questions <span style= " White-space:pre "></span>//questions.    So I usually debug the number of threads in the entire grid to 1, and then Debug.    if (I < numelements) {C[i] = A[i] + b[i]; }}/** * Host main routine */intmain (void) {//Error code to check return values for CUDA calls cudaerror_t err = cud Asuccess;    Print the vector length to being used, and compute its size int numelements = 50000;    size_t size = numelements * sizeof (float);    printf ("[Vector addition of%d elements]\n", numelements);    Allocate the host input vector A float *h_a = (float *) malloc (size);    Allocate the host input vector B float *h_b = (float *) malloc (size);    Allocate the host output vector C float *h_c = (float *) malloc (size);  Verify that allocations succeeded if (h_a = = NULL | | h_b = NULL | | h_c = = NULL) {fprintf (stderr, "Failed        To allocate host vectors!\n ");    Exit (Exit_failure); }//Initialize the host input vectors for (int i = 0; i < numelements; ++i) {H_a[i] = rand ()/(float) R        And_max;    H_b[i] = rand ()/(float) Rand_max;    }//Allocate the device input vector A float *d_a = NULL;    Err = Cudamalloc ((void * *) &d_a, size); if (err! 
= cudasuccess) {fprintf (stderr, "Failed to allocate device Vector A (error code%s)!\n ", cudageterrorstring (err));    Exit (Exit_failure);    }//Allocate the device input vector B float *d_b = NULL;    Err = Cudamalloc ((void * *) &d_b, size); if (err! = cudasuccess) {fprintf (stderr, "Failed to allocate device vector B (error code%s)!\n", cudageterrors        Tring (err));    Exit (Exit_failure);    }//Allocate the device output vector C float *d_c = NULL;    Err = Cudamalloc ((void * *) &d_c, size); if (err! = cudasuccess) {fprintf (stderr, "Failed to allocate device vector C (error code%s)!\n", cudageterrors        Tring (err));    Exit (Exit_failure); }//Copy the host input vectors A and B in host memory to the device input vectors in//device memory printf ("C    opy input data from the host memory to the CUDA device\n ");    Err = cudamemcpy (d_a, h_a, size, cudamemcpyhosttodevice); if (err! = cudasuccess) {fprintf (stderr, "Failed to copy vector A from the host to device (error code%s)!\ n ", cudageterrorstring (err));    Exit (Exit_failure);    } err = cudamemcpy (d_b, h_b, size, cudamemcpyhosttodevice); if (err! = cudasuccess) {fprintf (stderr, "Failed to copy vectors B from the host to device (error code%s)!\n", Cuda        GetErrorString (err));    Exit (Exit_failure);    }//Launch the Vector Add CUDA Kernel int threadsperblock = 256; int Blockspergrid = (numelements + threadsPerBlock-1)/threadsperblock;//According to data size allocation block processing mode printf ("CUDA kernel Launc    H with%d blocks of%d threads\n ", Blockspergrid, Threadsperblock);    Vectoradd<<<blockspergrid, Threadsperblock>>> (d_a, D_b, D_c, numelements);    Err = Cudagetlasterror (); if (err! = cudasuccess) {fprintf (stderr, "Failed to launch Vectoradd kernel (error code%s)!\n", Cudageterrorst        Ring (ERR));    Exit (Exit_failure);    }//Copy The device result vector in device memory to the host result vector//in host memory. 
printf ("Copy output data from the CUDA device to the host memory\n ");    Err = cudamemcpy (H_c, D_c, size, cudamemcpydevicetohost); if (err! = cudasuccess) {fprintf (stderr, "Failed to copy vector C from device to host (error code%s)!\n", Cuda        GetErrorString (err));    Exit (Exit_failure);  }//Verify that the result vector was correct for (int i = 0; i < numelements; ++i) {if (Fabs (h_a[i) +            H_b[i]-h_c[i]) > 1e-5) {fprintf (stderr, "Result verification failed at Element%d!\n", i);        Exit (Exit_failure);    }} printf ("Test passed\n");    Free device global Memory Err = Cudafree (d_a); if (err! = cudasuccess) {fprintf (stderr, "Failed to free device vector A (error code%s)!\n", Cudageterrorstrin        g (Err));    Exit (Exit_failure);    } err = Cudafree (d_b); if (err! = cudasuccess) {fprintf (stderr, "Failed to free device vector B (error code%s)!\n", Cudageterrorstrin        g (Err));    Exit (Exit_failure); } ERR = Cudafree (D_c); if (err! = cudasuccess) {fprintf (stderr, "Failed to free device vector C (error code%s)!\n", Cudageterrorstrin        g (Err));    Exit (Exit_failure);    }//Free host memory free (H_A);    Free (h_b);    Free (H_c); Reset the device and exit//Cudadevicereset causes the driver to clean up all state.  While//isn't mandatory in normal operation, it's is good practice. It was also//needed to ensure correct operation when the application was being//profiled. Calling Cudadevicereset causes all profiles data to be/flushed before the application exits Err = Cudadevicereset ( );//focus on the record and the introduction of a place, before doing muti-gpu in the time is free to lose memory after it, later found this function, very good habit! 
Remember the first time to write the CUDA program parallel image Pyramid, did not write to, the results can always be seen before the GPU processed images, let me baffled its solution, haha//For beginners, just started to write parallel programs will encounter a lot of inexplicable problems ... if (err! = cudasuccess) {fprintf (stderr, "Failed to deinitialize the device!        Error=%s\n ", cudageterrorstring (err));    Exit (Exit_failure);    } printf ("done\n"); return 0;}


Copyright NOTICE: This article is the blogger's original work; do not reproduce it without the blogger's permission.

Cuda Learning Note Two

Contact Us

The content on this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.