VS2015 + CUDA 8.0 environment configuration
The correct configuration is recorded here:
1. First, download the CUDA Toolkit version matching your Visual Studio version from NVIDIA's official website:
https://developer.nvidia.com/cuda-toolkit-archive
(Remember: VS2010 pairs with CUDA 5.0, VS2013 with CUDA 7.5, and VS2015 with CUDA 8.0.)
2. Then run the installer. If you do not want your existing graphics driver replaced, choose the Custom installation and untick the driver component; the Express installation installs the bundled driver unconditionally, which can fail or downgrade your driver if a newer one is already installed.
3. After the installation completes, open C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0 and confirm that the bin, lib, include, etc. folders are present — this indicates the installation succeeded.
4. Next, let's see how to create a project that uses CUDA. When you create a new project in Visual Studio you will see a new CUDA project type.
But here we show how to compile a .cu file inside an ordinary empty project instead: create a Visual C++ empty project, then add one .cpp file and one .cu file to it.
The test.cpp code is as follows:
// test.cpp — host-side driver: multiplies two 1024x1024 matrices on the CPU
// and on the GPU (kernel wrapper defined in test.cu) and prints the elapsed
// clock() ticks for each.
#include <time.h>
#include <stdlib.h>
#include <stdio.h>

// Forward declaration of the GPU entry point defined in test.cu.
// extern "C" is required so the C++ compiler does not mangle the name.
extern "C" void matrixMultiplication_CUDA(const float* M, const float* N,
                                          float* P, int width);

// Fill a width*width row-major matrix with pseudo-random values in ~[0, 1).
// The second term adds sub-RAND_MAX resolution; both operands are cast to
// float before multiplying so RAND_MAX*RAND_MAX cannot overflow int on
// platforms where RAND_MAX is 2^31-1.
void matGen(float* a, int width) {
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < width; j++) {
            a[i * width + j] = (float)rand() / RAND_MAX
                             + (float)rand() / ((float)RAND_MAX * (float)RAND_MAX);
        }
    }
}

// Reference matrix multiplication on the CPU (used to validate the GPU):
// P = M * N; all matrices are width x width, row-major.
void matrixMultiplication(const float* M, const float* N, float* P, int width) {
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < width; j++) {
            float sum = 0.0f;
            for (int k = 0; k < width; k++) {
                sum += M[i * width + k] * N[k * width + j];
            }
            P[i * width + j] = sum;
        }
    }
}

// Time one 1024x1024 matrix product on the GPU; returns elapsed clock ticks
// (0.0 if host allocation fails).
double matrixMul_GPU() {
    const int width = 1024;                                 // 1024x1024 matrices
    float* M  = (float*)malloc(sizeof(float) * width * width);
    float* N  = (float*)malloc(sizeof(float) * width * width);
    float* Pg = (float*)malloc(sizeof(float) * width * width); // GPU result
    if (!M || !N || !Pg) {                                  // guard against OOM
        fprintf(stderr, "matrixMul_GPU: host allocation failed\n");
        free(M); free(N); free(Pg);
        return 0.0;
    }
    srand(0);                                               // same seed as CPU run
    matGen(M, width);                                       // generate matrix M
    matGen(N, width);                                       // generate matrix N

    double timeStart = clock();
    matrixMultiplication_CUDA(M, N, Pg, width);             // compute on the GPU
    double timeEnd = clock();

    free(M);
    free(N);
    free(Pg);
    return timeEnd - timeStart;
}

// Time the same product on the CPU; returns elapsed clock ticks
// (0.0 if host allocation fails).
double matrixMul_CPU() {
    const int width = 1024;                                 // 1024x1024 matrices
    float* M  = (float*)malloc(sizeof(float) * width * width);
    float* N  = (float*)malloc(sizeof(float) * width * width);
    float* Pc = (float*)malloc(sizeof(float) * width * width); // CPU result
    if (!M || !N || !Pc) {                                  // guard against OOM
        fprintf(stderr, "matrixMul_CPU: host allocation failed\n");
        free(M); free(N); free(Pc);
        return 0.0;
    }
    srand(0);                                               // same seed as GPU run
    matGen(M, width);                                       // generate matrix M
    matGen(N, width);                                       // generate matrix N

    double timeStart = clock();
    matrixMultiplication(M, N, Pc, width);                  // compute on the CPU
    double timeEnd = clock();

    free(M);
    free(N);
    free(Pc);
    return timeEnd - timeStart;
}

//////////////////////////////////////////////////////////////////////////
int main() {
    printf("CPU use time %g\n", matrixMul_CPU());
    printf("GPU use time %g\n", matrixMul_GPU());
    system("pause");   // keep the console window open (Windows)
    return 0;
}
The TEST.CU code is as follows:
// test.cu — naive matrix-multiplication kernel and its extern "C" host
// wrapper, callable from test.cpp.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define TILE_WIDTH 16   // thread block is TILE_WIDTH x TILE_WIDTH

// Kernel: one thread computes one element of Pd = Md * Nd.
// All matrices are width x width, row-major, in device global memory.
// Launch layout: 2D grid of TILE_WIDTH x TILE_WIDTH blocks covering Pd.
__global__ void matrixMulKernel(const float* Md, const float* Nd,
                                float* Pd, int width) {
    // Row and column of the Pd element this thread is responsible for.
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    if (row >= width || col >= width) {
        return;   // grid is rounded up, so guard the tail threads
    }
    float pValue = 0.0f;   // float literal — avoid double promotion
    for (int k = 0; k < width; k++) {
        pValue += Md[row * width + k] * Nd[k * width + col];
    }
    Pd[row * width + col] = pValue;   // each thread writes one element
}

// Matrix multiplication on the GPU: P = M * N (host pointers, width x width,
// row-major). extern "C" prevents C++ name mangling so test.cpp can link it.
extern "C" void matrixMultiplication_CUDA(const float* M, const float* N,
                                          float* P, int width) {
    cudaSetDevice(0);   // select the target GPU

    float *Md, *Nd, *Pd;
    // size_t avoids int overflow for large widths (width*width*4 bytes).
    size_t size = (size_t)width * width * sizeof(float);
    cudaMalloc((void**)&Md, size);
    cudaMalloc((void**)&Nd, size);
    cudaMalloc((void**)&Pd, size);

    // Copy the input matrices from host memory to device memory.
    cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
    cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);

    // Round the grid up so widths that are not a multiple of TILE_WIDTH are
    // still fully covered (the kernel bounds-checks the tail threads).
    dim3 dimGrid((width + TILE_WIDTH - 1) / TILE_WIDTH,
                 (width + TILE_WIDTH - 1) / TILE_WIDTH);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    matrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, width);

    // Kernel launches return no status directly — check for launch errors.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "matrixMulKernel launch failed: %s\n",
                cudaGetErrorString(err));
    }

    // Copy the result back; this blocking cudaMemcpy also synchronizes
    // with the kernel, so P is complete when it returns.
    cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);

    // Release the device matrices.
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}
Next is the third-party library link, first of all, you have to right-click the project, open the project properties
In Executable Directories, enter: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin
In Include Directories, enter: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include
In Library Directories, enter: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\Win32
Then, under Linker > Input > Additional Dependencies, add the file names of all the .lib files in the C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\Win32 directory (e.g. cudart.lib).
At this point, if you try to compile right away, you will get an error along the lines of "unresolved external symbol" for the extern "C" function.
This is because the compiler is not yet compiling the .cu file, so the .cpp file cannot link against the function defined there.
The most critical step is to:
Right-click the project, click Build Dependencies > Build Customizations, and tick the CUDA 8.0 entry.
Then right-click the test.cu file, open its Properties, and set its Item Type to "CUDA C/C++":
It's done, happy commissioning.