First, create your own project: an ordinary C++ project.
Then write the C++ program you want to run.
Tick one of the CUDA options. I have two CUDA versions installed, so there are two entries; selecting either one is fine.
Select Properties
Of course, if the item was created as CUDA C++ in the first place, there is no need to make this selection.
Then open the project's properties and add cudart.lib to Additional Dependencies under Configuration Properties -> Linker -> Input.
And then you can write your own code.
The code in GPUCPUTEST.CU is as follows:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
// Legacy helper header from the old CUDA SDK samples.
#include <cutil.h>

using namespace std;
/**
 * Select the first CUDA device whose compute-capability major version is
 * at least 1 and make it the current device.
 *
 * Prints the chosen device's name to stdout on success and a diagnostic to
 * stderr on failure.
 *
 * @return true if a device was selected, false otherwise.
 */
bool Initcuda(void)
{
    int count = 0;
    // A failed query leaves no usable device, so treat it like "count == 0".
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // On break, `prop` holds the properties of the device that was accepted,
    // so it can be reused for the name printout below.
    cudaDeviceProp prop;
    int i = 0;
    for (i = 0; i < count; i++) {
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess && prop.major >= 1) {
            break;
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting cuda.\n");
        return false;
    }

    const cudaError_t err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed: %s\n", cudaGetErrorString(err));
        return false;
    }

    printf("CUDA initialized.\n");
    printf("Device: \"%s\"\n", prop.name);
    return true;
}
// Alternative (smaller, non-square) problem size and launch configuration,
// kept for reference. Only one set of definitions may be active at a time;
// defining the same macro twice with different values is a redefinition.
// #define AW 855
// #define AH 511
// #define BW 1013
// #define Blocknum 32//32
// #define Threadnum 256//256

// Active configuration.
// AW / AH: width and height of matrix A; BW: width of matrix B.
// Blocknum / Threadnum: blocks per grid and threads per block for the launch.
#define AW 2000
#define AH 2000
#define BW 2000
#define Blocknum 445//32
#define Threadnum 445//256
/**
 * Dense integer matrix stored row-major in a flat heap buffer:
 * the value at (row, col) lives at element[row * width + col].
 */
typedef struct
{
    int width;    // number of columns
    int height;   // number of rows
    int *element; // width * height values, allocated with malloc
} Matrix;

// Lower-case spelling kept as an alias so code using either name compiles.
typedef Matrix matrix;
/**
 * Allocate a w x h matrix (w columns, h rows) and fill it with
 * pseudo-random values in [0, 9] drawn from rand().
 *
 * Negative dimensions are clamped to 0. If the matrix is empty, or the
 * allocation fails, `element` is NULL; on allocation failure the dimensions
 * are also reset to 0 so that callers iterating width * height never touch
 * the null buffer.
 *
 * @param w number of columns
 * @param h number of rows
 * @return the new matrix; the caller owns `element` and must free() it.
 */
Matrix Initmatrix(int w, int h)
{
    Matrix t;
    t.width = (w > 0) ? w : 0;
    t.height = (h > 0) ? h : 0;
    t.element = NULL;

    // Compute the element count in size_t so w * h cannot overflow int.
    const size_t count = (size_t)t.width * (size_t)t.height;
    if (count == 0) {
        return t;
    }

    t.element = (int *)malloc(count * sizeof(int));
    if (t.element == NULL) {
        fprintf(stderr, "Initmatrix: out of memory for %d x %d matrix.\n", w, h);
        t.width = 0;
        t.height = 0;
        return t;
    }

    for (size_t i = 0; i < count; i++) {
        t.element[i] = rand() % 10;
    }
    return t;
}
/**
 * CPU reference matrix multiplication: returns a * b.
 *
 * Both inputs are row-major. The result has b.width columns and a.height
 * rows. The inner dimension must match (a.width == b.height); on a mismatch,
 * a missing input buffer, or an allocation failure, an empty matrix
 * (width = height = 0, element = NULL) is returned and a diagnostic is
 * written to stderr.
 *
 * @param a left operand
 * @param b right operand
 * @return the product; the caller owns `element` and must free() it.
 */
Matrix MM(Matrix a, Matrix b)
{
    Matrix t;
    t.width = 0;
    t.height = 0;
    t.element = NULL;

    if (a.width != b.height) {
        fprintf(stderr, "MM: dimension mismatch (a.width=%d, b.height=%d).\n",
                a.width, b.height);
        return t;
    }
    if (a.height <= 0 || b.width <= 0 || a.width < 0) {
        return t; // empty product
    }
    if (a.width > 0 && (a.element == NULL || b.element == NULL)) {
        fprintf(stderr, "MM: input matrix has no data.\n");
        return t;
    }

    const size_t rows = (size_t)a.height;
    const size_t cols = (size_t)b.width;
    const size_t inner = (size_t)a.width;

    t.element = (int *)malloc(rows * cols * sizeof(int));
    if (t.element == NULL) {
        fprintf(stderr, "MM: out of memory for %d x %d result.\n", b.width, a.height);
        return t;
    }
    t.width = b.width;
    t.height = a.height;

    for (size_t r = 0; r < rows; r++) {
        const int *aRow = a.element + r * inner;
        int *outRow = t.element + r * cols;
        for (size_t c = 0; c < cols; c++) {
            // Accumulate locally and store once per output element.
            int sum = 0;
            for (size_t k = 0; k < inner; k++) {
                sum += aRow[k] * b.element[k * cols + c];
            }
            outRow[c] = sum;
        }
    }
    return t;
}
/**
 * Integer matrix multiplication kernel: mc = ma * mb (all row-major).
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its flat
 * global index and advances by the total number of launched threads, so any
 * grid/block size produces a complete result.
 * Shared memory: none.
 *
 * @param ma left operand, mp[0] columns per row
 * @param mb right operand, mp[2] columns per row
 * @param mc output, mp[4] columns by mp[5] rows
 * @param mp six ints: {a.width, a.height, b.width, b.height, c.width, c.height}
 *
 * All four pointers are dereferenced on the device and must therefore be
 * device-accessible. Indices are 32-bit, so c.width * c.height and the
 * operand sizes must fit in int.
 */
__global__ static void Matrixmul(int *ma, int *mb, int *mc, int *mp)
{
    const int aw = mp[0];
    const int bw = mp[2];
    const int cw = mp[4];
    const int ch = mp[5];

    const int total = cw * ch;
    // Derive the stride from the actual launch configuration rather than
    // from compile-time macros, so the kernel stays correct if the launch
    // parameters change.
    const int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
    {
        const int row = i / cw;
        const int col = i - row * cw;
        const int *aRow = ma + row * aw;

        // Accumulate in a register; write global memory once per element.
        int sum = 0;
        for (int k = 0; k < aw; k++)
        {
            sum += aRow[k] * mb[col + k * bw];
        }
        mc[i] = sum;
    }
}
/**
 * Report a failed CUDA runtime call on stderr.
 *
 * @param err  status returned by the call (or by cudaGetLastError())
 * @param what short description of the operation, used in the message
 * @return true if err is cudaSuccess, false otherwise
 */
static bool checkCuda(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Multiply two random integer matrices on the CPU and on the GPU, print the
 * clock() ticks each path took, and print every pair of results (followed by
 * "ERROR" where they differ).
 *
 * Declared extern "C" so that it can be called from a .cpp file through a
 * plain extern declaration. Returns silently if no CUDA device is available.
 */
extern "C" void Gpucputest()
{
    if (!Initcuda()) {
        return;
    }

    // Define the matrices. A has AW columns and AH rows, B has BW columns and
    // AW rows, so the product C = A * B has BW columns and AH rows.
    Matrix matrixa = Initmatrix(AW, AH);
    Matrix matrixb = Initmatrix(BW, AW);
    Matrix gpuresult = Initmatrix(BW, AH);
    int matrixprop[6];

    // CPU matrix multiplication, timed with clock().
    clock_t start = clock();
    Matrix matrixc = MM(matrixa, matrixb);
    clock_t finish = clock();
    printf("CPU time =%d\n", (int)(finish - start));

    // The GPU result is copied into gpuresult and compared element by element
    // with matrixc, so every host buffer must exist and the shapes must agree.
    if (matrixa.element == NULL || matrixb.element == NULL ||
        matrixc.element == NULL || gpuresult.element == NULL ||
        gpuresult.width != matrixc.width || gpuresult.height != matrixc.height) {
        fprintf(stderr, "Gpucputest: host matrix setup failed.\n");
        free(matrixa.element);
        free(matrixb.element);
        free(matrixc.element);
        free(gpuresult.element);
        return;
    }

    // GPU timing covers allocation, transfers, and the kernel itself.
    start = clock();
    matrixprop[0] = matrixa.width;
    matrixprop[1] = matrixa.height;
    matrixprop[2] = matrixb.width;
    matrixprop[3] = matrixb.height;
    matrixprop[4] = matrixc.width;
    matrixprop[5] = matrixc.height;

    const size_t bytesA = sizeof(int) * (size_t)matrixa.width * (size_t)matrixa.height;
    const size_t bytesB = sizeof(int) * (size_t)matrixb.width * (size_t)matrixb.height;
    const size_t bytesC = sizeof(int) * (size_t)matrixc.width * (size_t)matrixc.height;
    const size_t bytesP = sizeof(int) * 6;

    // Request device memory. Pointers start as NULL so the cleanup below is
    // safe even if an allocation fails part-way through.
    int *ma = NULL, *mb = NULL, *mc = NULL, *mp = NULL;
    bool ok = checkCuda(cudaMalloc((void **)&ma, bytesA), "cudaMalloc(ma)") &&
              checkCuda(cudaMalloc((void **)&mb, bytesB), "cudaMalloc(mb)") &&
              checkCuda(cudaMalloc((void **)&mc, bytesC), "cudaMalloc(mc)") &&
              checkCuda(cudaMalloc((void **)&mp, bytesP), "cudaMalloc(mp)");

    // Copy the inputs to device memory.
    ok = ok &&
         checkCuda(cudaMemcpy(ma, matrixa.element, bytesA, cudaMemcpyHostToDevice),
                   "cudaMemcpy(ma)") &&
         checkCuda(cudaMemcpy(mb, matrixb.element, bytesB, cudaMemcpyHostToDevice),
                   "cudaMemcpy(mb)") &&
         checkCuda(cudaMemcpy(mp, matrixprop, bytesP, cudaMemcpyHostToDevice),
                   "cudaMemcpy(mp)");

    if (ok) {
        // Call the CUDA kernel.
        Matrixmul<<<Blocknum, Threadnum, 0>>>(ma, mb, mc, mp);
        // Launch-configuration errors surface via cudaGetLastError();
        // execution errors surface at the synchronizing call.
        ok = checkCuda(cudaGetLastError(), "Matrixmul launch") &&
             checkCuda(cudaDeviceSynchronize(), "Matrixmul execution") &&
             // Copy the result back out of device memory.
             checkCuda(cudaMemcpy(gpuresult.element, mc, bytesC, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(result)");
    }
    finish = clock();

    if (ok) {
        printf("GPU time =%d\n", (int)(finish - start));

        const size_t total = (size_t)gpuresult.width * (size_t)gpuresult.height;
        for (size_t i = 0; i < total; i++)
        {
            printf("%d--%d\n", matrixc.element[i], gpuresult.element[i]);
            if (matrixc.element[i] != gpuresult.element[i])
            {
                printf("ERROR");
            }
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial failure.
    checkCuda(cudaFree(ma), "cudaFree(ma)");
    checkCuda(cudaFree(mb), "cudaFree(mb)");
    checkCuda(cudaFree(mc), "cudaFree(mc)");
    checkCuda(cudaFree(mp), "cudaFree(mp)");

    // Release the host buffers.
    free(matrixa.element);
    free(matrixb.element);
    free(matrixc.element);
    free(gpuresult.element);

    // Keeps the console window open; "pause" is a Windows shell command.
    system("pause");
}
When calling the function from a .cpp file, remember to declare it with extern "C" (note that the C is uppercase).
You can then run the program. To call a function defined in a .cu or .cuh file from a .cpp or .h file, do not #include the CUDA file; instead, declare the function with extern in the .cpp or .h file, which makes the CUDA function defined in the .cu or .cuh file visible as a public function, and then call it directly.
The above program runs as follows:
This write-up is rather rough; it is mainly meant as a note for future reference.