In addition to writing CUDA code directly in a project using .cu or .cuh files, you can place the CUDA-related code in a DLL project, compile that project into a dynamic-link library (DLL), and then load the DLL from the project that needs it and call the functions it exports.
Now create a new DLL project with the project name Test00302, as shown in the following illustration:
Now create a new file named Test.cu in your project, as shown in the following illustration:
Then set the build customization for the project as shown in the following illustration:
Then set the "Item Type" in the properties of Test.cu to "CUDA C/C++", as shown in the following illustration:
Finally, add cudart.lib to the linker's additional dependencies in the project properties, as shown in the following illustration:
Add the following code to Test.cu:
#include <stdio.h>
#include <iostream>
#include <cuda_runtime.h>  // CUDA runtime library header

using namespace std;

extern "C"
{
// NOTE(review): TEST00302_EXPORTS is presumably defined in the preprocessor
// settings of the DLL project itself, so the function is exported while the
// DLL is built and imported everywhere else -- confirm in the project settings.
#ifdef TEST00302_EXPORTS
#define TEST00302_API __declspec(dllexport)
#else
#define TEST00302_API __declspec(dllimport)
#endif

    TEST00302_API void Showdeviceprop(void);  // function to export
}

// Prints the properties of every CUDA device visible to the runtime to stdout.
//
// If the device count cannot be queried, or no device is found, a hint about
// checking the hardware and the driver version is printed instead. A device
// whose properties cannot be read is reported and skipped; the remaining
// devices are still listed.
void Showdeviceprop(void)
{
    static const char *const kNoDeviceHint =
        "The device information was not acquired. Check to see if your computer "
        "has a graphics device that supports CUDA and whether the Cuda driver "
        "version needs to be updated.\n";

    int count = 0;
    cudaError_t cudaStatus = cudaGetDeviceCount(&count);
    if (cudaStatus != cudaSuccess)
    {
        cout << kNoDeviceHint;
        return;
    }

    cout << "total number of units:" << count << "\n";
    if (count <= 0)
    {
        cout << kNoDeviceHint;
        return;
    }

    for (int i = 0; i < count; i++)
    {
        cudaDeviceProp prop;

        // Get device property information; do not print an unfilled struct.
        cudaStatus = cudaGetDeviceProperties(&prop, i);
        if (cudaStatus != cudaSuccess)
        {
            cout << "\nthe " << i + 1 << " equipment information could not be read: "
                 << cudaGetErrorString(cudaStatus) << "\n";
            continue;
        }

        cout << "\nthe " << i + 1 << " equipment information:\n";
        cout << "Device Name:" << prop.name << "\n";
        cout << "Total Memory:" << prop.totalGlobalMem / 1048576 << "M\n";
        cout << "Constant Memory:" << prop.totalConstMem << " bytes\n";
        cout << "Number of processors in the device:" << prop.multiProcessorCount << "\n";
        cout << "Each thread block contains a maximum number of threads:"
             << prop.maxThreadsPerBlock << "\n";
        cout << "Number of thread blocks that can be contained in a single grid: i="
             << prop.maxGridSize[0] << " j=" << prop.maxGridSize[1]
             << " k=" << prop.maxGridSize[2] << "\n";
        cout << "The maximum number of threads that can be contained in a multidimensional thread block: i="
             << prop.maxThreadsDim[0] << " j=" << prop.maxThreadsDim[1]
             << " k=" << prop.maxThreadsDim[2] << "\n";
    }
}
Note the extern "C" block at the top of the listing: it is what exports the CUDA function defined in the .cu file under a plain C name, so that it can later be looked up by that name. Compile the project; the file Test00302.dll is generated in the Debug folder, as shown in the following illustration:
Now create a new Visual Studio console application project named Test00303, and add code to Test00303.cpp that loads the DLL dynamically and executes the Showdeviceprop() function inside it, as follows:
// Test00303.cpp: Defines the entry point for a console application.
//
#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <Windows.h>

// Dynamic-load link test.

// Pointer type matching the prototype of the function that is called from the DLL.
typedef void (*DLLFUNC)(void);

// Loads Test00302.dll at run time, looks up the exported function
// "Showdeviceprop" and calls it. Prints a message if the DLL or the function
// cannot be found. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
    HINSTANCE myCudaDll = LoadLibrary(_T("Test00302.dll"));  // dynamically load the DLL file
    if (myCudaDll)
    {
        // Get the function pointer; the name must match the exported name exactly.
        DLLFUNC dllFun = (DLLFUNC)GetProcAddress(myCudaDll, "Showdeviceprop");
        if (dllFun)
        {
            dllFun();  // execute the function
        }
        else
        {
            printf("No such function exists in DLL\n");  // possibly due to a function name error
        }
        FreeLibrary(myCudaDll);  // unload the DLL file
    }
    else
    {
        printf("Load DLL failed!\n");
    }
    system("pause");
    return 0;
}
Then copy the dynamic-link library file Test00302.dll generated by the project Test00302 to the Debug folder of the project Test00303, as shown in the following illustration:
Run the project Test00303, and the results are as shown in the following illustration:
Project Test00303 is an ordinary console application, and Test00302.dll is the compiled dynamic-link library that contains the CUDA function. By loading the DLL dynamically, an ordinary application can run the CUDA code inside it.
To test calling a CUDA kernel function, first create a new file named Test2.cu in the project Test00302. In Test2.cu, add the GPU kernel function addKernel() and the vector-addition function Vectoradd(). Vectoradd() selects the GPU device to run on, allocates memory on the device, copies the host data to device memory, launches the kernel, calls cudaDeviceSynchronize() to wait for the kernel to finish, copies the result from device memory back to host memory, and cleans up by freeing the device memory and resetting the CUDA device. The code in Test2.cu is as follows:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

extern "C"
{
// NOTE(review): TEST00302_EXPORTS is presumably defined in the preprocessor
// settings of the DLL project itself -- confirm in the project settings.
#ifdef TEST00302_EXPORTS
#define TEST00302_API __declspec(dllexport)
#else
#define TEST00302_API __declspec(dllimport)
#endif

    TEST00302_API int Vectoradd(int c[], int a[], int b[], int size);
}

// CUDA kernel: c[i] = a[i] + b[i] for every i in [0, size).
//
// Expects a 1D grid of 1D blocks; any grid size is valid because threads whose
// global index is >= size do nothing. Uses no shared memory.
__global__ void addKernel(int *c, const int *a, const int *b, int size)
{
    // Unsigned arithmetic: the last block may extend slightly past `size`.
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < (unsigned int)size)
    {
        c[i] = a[i] + b[i];
    }
}

// Vector addition on the GPU: c[i] = a[i] + b[i] for i in [0, size).
//
// Parameters:
//   c    - host output array with room for `size` ints
//   a, b - host input arrays holding `size` ints each
//   size - number of elements; must be > 0
//
// Returns 0 on success, otherwise the code of the step that failed:
//   -1 invalid arguments (NULL pointer or size <= 0)
//    1 cudaSetDevice
//    2 / 3 / 4 cudaMalloc for dev_c / dev_a / dev_b
//    5 / 6 host-to-device copy of a / b
//    7 kernel launch or kernel execution
//    8 device-to-host copy of c
//    9 cudaDeviceReset
//
// NOTE(review): on success this function calls cudaDeviceReset(), which per the
// CUDA runtime documentation tears down the calling process's state on the
// current device. Callers that use CUDA elsewhere in the same process should
// be aware of this.
int Vectoradd(int c[], int a[], int b[], int size)
{
    int result = -1;
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    const int threadsPerBlock = 256;
    int blocks = 0;
    size_t bytes = 0;

    // Reject input that would be dereferenced or produce an empty launch.
    if (c == 0 || a == 0 || b == 0 || size <= 0)
    {
        return result;
    }

    bytes = (size_t)size * sizeof(int);
    // Ceiling division written so that it cannot overflow for large sizes.
    blocks = (size - 1) / threadsPerBlock + 1;

    // Select the GPU to run on.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        result = 1;
        goto Cleanup;
    }

    // Allocate device memory for dev_c, dev_a and dev_b.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess)
    {
        result = 2;
        goto Cleanup;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess)
    {
        result = 3;
        goto Cleanup;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess)
    {
        result = 4;
        goto Cleanup;
    }

    // Copy the input data from host memory to GPU memory.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        result = 5;
        goto Cleanup;
    }
    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        result = 6;
        goto Cleanup;
    }

    // Start the GPU kernel. A launch does not return an error itself, so
    // configuration errors are picked up with cudaGetLastError().
    addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        result = 7;
        goto Cleanup;
    }

    // Wait for the kernel to finish and collect any error raised while it ran.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        result = 7;
        goto Cleanup;
    }

    // Copy the result from GPU memory back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        result = 8;
        goto Cleanup;
    }

    result = 0;

Cleanup:
    // Release the device buffers first; cudaFree on a null pointer is a no-op.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    // Reset the device only on the success path and only after the buffers are
    // released, so the frees never run against a device that was already reset.
    if (result == 0)
    {
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess)
        {
            result = 9;
        }
    }
    return result;
}
Compile the project Test00302, copy the generated dynamic-link library file Test00302.dll to the Debug folder of the project Test00303, and then change the contents of Test00303.cpp in the project Test00303 as follows:
Test00303.cpp: Defines the entry point for a console application. #include "stdafx.h" #include <stdio.h> #include <stdlib.h> #include <Windows.h> typedef int (*DLLFUNC) (void)//declares a function pointer typedef int (*DLLFUNC2) (int *c, int *a, int *b,int size) that needs to be called from a DLL;//declares a function pointer int for a function prototype that needs to be called from a DLL _tmai n (int argc, _tchar* argv[]) {hinstance mycudadll=loadlibrary (__t ("Test00302.dll"))//Dynamically load DLL file if (Mycudadll) {//Tune Displays device property information with the Showdeviceprop function in the DLL//dllfunc dllfun= (dllfunc) GetProcAddress (Mycudadll, "Showdeviceprop");/Get function pointer//if (dllfun)//{//Dllfun ()///Execute function//}//else//{//printf ("No such function exists in DLL");//possibly due to function name error/////Call to Vectorad in DLL d function DLLFUNC2 dllfun2= (DLLFUNC2) GetProcAddress (Mycudadll, "Vectoradd");//Get function pointer if (DLLFUN2) {const int arraysize
= 5;
int A[arraysize] = {1, 2, 3, 4, 5};
int B[arraysize] = {10, 20, 30, 40, 50};
int C[arraysize] = {0};
int r=dllfun2 (c,a,b,arraysize);//Execute function if (r==0) {printf ("compute success \ n"); printf ("{1,2,3,4,5} + {10,20, 30,40,50} = {%d,%d,%d,%d,%d}\n ", c[0], c[1], c[2], c[3], c[4]);
else printf ("Compute failed \ n"); else {printf ("does not exist in DLL");//possibly due to a function name error} freelibrary (Mycudadll);//Uninstall DLL file dynamically} else {printf ("Failed to load DLL!").
");
System ("pause");
return 0;
}
Because the function Vectoradd exported by the DLL is declared as follows:
int Vectoradd (int c[], int a[], int b[],int size);
the function pointer type used to call it must have the same return type and parameter list, so it needs to be declared as:
typedef int (*DLLFUNC2) (int *c, int *a, int *b,int size);
Or:
typedef int (*DLLFUNC2) (int c[], int a[], int b[],int size);
Run the project Test00303, and the results are as shown in the following illustration:
Although this example is relatively simple, the vector addition really is computed on the GPU.