Cuda Learning Note Two

Source: Internet
Author: User

A simple vector addition example


/** * Vector addition:c = a + B. * * This sample was A very basic sample that implements element by element * Vector Addit Ion. It is the same as the sample illustrating Chapter 2 * of the Programming Guide with some additions like error checking. */#include <stdio.h>//for the CUDA runtime routines (prefixed with "Cuda_") #include <cuda_runtime.h>/** * CUD A Kernel Device code * * Computes the vector addition of a and B into C. The 3 vectors has the same * number of elements numelements. */__global__ voidvectoradd (const float *a, const float *b, float *c, int numelements) {int i = blockdim.x * blockidx.x + threadidx.x;//COMPUTE thread index//printf ("Thread index:%d\n", i),//cuda 2.0 above support kernel function printing, of course, when the number of threads is much easier to cause some questions <span style= " White-space:pre "></span>//questions.    So I usually debug the number of threads in the entire grid to 1, and then Debug.    if (I < numelements) {C[i] = A[i] + b[i]; }}/** * Host main routine */intmain (void) {//Error code to check return values for CUDA calls cudaerror_t err = cud Asuccess;    Print the vector length to being used, and compute its size int numelements = 50000;    size_t size = numelements * sizeof (float);    printf ("[Vector addition of%d elements]\n", numelements);    Allocate the host input vector A float *h_a = (float *) malloc (size);    Allocate the host input vector B float *h_b = (float *) malloc (size);    Allocate the host output vector C float *h_c = (float *) malloc (size);  Verify that allocations succeeded if (h_a = = NULL | | h_b = NULL | | h_c = = NULL) {fprintf (stderr, "Failed        To allocate host vectors!\n ");    Exit (Exit_failure); }//Initialize the host input vectors for (int i = 0; i < numelements; ++i) {H_a[i] = rand ()/(float) R        And_max;    H_b[i] = rand ()/(float) Rand_max;    }//Allocate the device input vector A float *d_a = NULL;    Err = Cudamalloc ((void * *) &d_a, size); if (err! 
= cudasuccess) {fprintf (stderr, "Failed to allocate device Vector A (error code%s)!\n ", cudageterrorstring (err));    Exit (Exit_failure);    }//Allocate the device input vector B float *d_b = NULL;    Err = Cudamalloc ((void * *) &d_b, size); if (err! = cudasuccess) {fprintf (stderr, "Failed to allocate device vector B (error code%s)!\n", cudageterrors        Tring (err));    Exit (Exit_failure);    }//Allocate the device output vector C float *d_c = NULL;    Err = Cudamalloc ((void * *) &d_c, size); if (err! = cudasuccess) {fprintf (stderr, "Failed to allocate device vector C (error code%s)!\n", cudageterrors        Tring (err));    Exit (Exit_failure); }//Copy the host input vectors A and B in host memory to the device input vectors in//device memory printf ("C    opy input data from the host memory to the CUDA device\n ");    Err = cudamemcpy (d_a, h_a, size, cudamemcpyhosttodevice); if (err! = cudasuccess) {fprintf (stderr, "Failed to copy vector A from the host to device (error code%s)!\ n ", cudageterrorstring (err));    Exit (Exit_failure);    } err = cudamemcpy (d_b, h_b, size, cudamemcpyhosttodevice); if (err! = cudasuccess) {fprintf (stderr, "Failed to copy vectors B from the host to device (error code%s)!\n", Cuda        GetErrorString (err));    Exit (Exit_failure);    }//Launch the Vector Add CUDA Kernel int threadsperblock = 256; int Blockspergrid = (numelements + threadsPerBlock-1)/threadsperblock;//According to data size allocation block processing mode printf ("CUDA kernel Launc    H with%d blocks of%d threads\n ", Blockspergrid, Threadsperblock);    Vectoradd<<<blockspergrid, Threadsperblock>>> (d_a, D_b, D_c, numelements);    Err = Cudagetlasterror (); if (err! = cudasuccess) {fprintf (stderr, "Failed to launch Vectoradd kernel (error code%s)!\n", Cudageterrorst        Ring (ERR));    Exit (Exit_failure);    }//Copy The device result vector in device memory to the host result vector//in host memory. 
printf ("Copy output data from the CUDA device to the host memory\n ");    Err = cudamemcpy (H_c, D_c, size, cudamemcpydevicetohost); if (err! = cudasuccess) {fprintf (stderr, "Failed to copy vector C from device to host (error code%s)!\n", Cuda        GetErrorString (err));    Exit (Exit_failure);  }//Verify that the result vector was correct for (int i = 0; i < numelements; ++i) {if (Fabs (h_a[i) +            H_b[i]-h_c[i]) > 1e-5) {fprintf (stderr, "Result verification failed at Element%d!\n", i);        Exit (Exit_failure);    }} printf ("Test passed\n");    Free device global Memory Err = Cudafree (d_a); if (err! = cudasuccess) {fprintf (stderr, "Failed to free device vector A (error code%s)!\n", Cudageterrorstrin        g (Err));    Exit (Exit_failure);    } err = Cudafree (d_b); if (err! = cudasuccess) {fprintf (stderr, "Failed to free device vector B (error code%s)!\n", Cudageterrorstrin        g (Err));    Exit (Exit_failure); } ERR = Cudafree (D_c); if (err! = cudasuccess) {fprintf (stderr, "Failed to free device vector C (error code%s)!\n", Cudageterrorstrin        g (Err));    Exit (Exit_failure);    }//Free host memory free (H_A);    Free (h_b);    Free (H_c); Reset the device and exit//Cudadevicereset causes the driver to clean up all state.  While//isn't mandatory in normal operation, it's is good practice. It was also//needed to ensure correct operation when the application was being//profiled. Calling Cudadevicereset causes all profiles data to be/flushed before the application exits Err = Cudadevicereset ( );//focus on the record and the introduction of a place, before doing muti-gpu in the time is free to lose memory after it, later found this function, very good habit! 
Remember the first time to write the CUDA program parallel image Pyramid, did not write to, the results can always be seen before the GPU processed images, let me baffled its solution, haha//For beginners, just started to write parallel programs will encounter a lot of inexplicable problems ... if (err! = cudasuccess) {fprintf (stderr, "Failed to deinitialize the device!        Error=%s\n ", cudageterrorstring (err));    Exit (Exit_failure);    } printf ("done\n"); return 0;}


Copyright NOTICE: This article is the blogger's original work; do not reproduce it without the blogger's permission.

Cuda Learning Note Two

Contact Us

The content on this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.