Mixing C++ and CUDA: implementing a GPU program in a Visual Studio C++ project

Source: Internet
Author: User

First create your own project — an ordinary C++ project.

Write the C + + program you want to run



Check one of the CUDA build-customization options; I have two CUDA versions installed, so either one will do.

Select Properties

Of course, if the project was created as a CUDA C++ project from the start, this step is unnecessary.

Then open the project's properties and add cudart.lib to Additional Dependencies under Configuration Properties → Linker → Input.


And then you can write your own code.

The code in GPUCPUTEST.CU is as follows:

#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cuda.h>
using namespace Std;
// Initialize CUDA: find the first device with compute capability >= 1.0,
// make it the current device, and print its name.
// Returns false when no usable CUDA device is present.
bool Initcuda(void)
{
    int count = 0;
    cudaGetDeviceCount(&count);  // number of visible CUDA devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i = 0;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {  // any CUDA-capable device qualifies
                break;
            }
        }
    }
    if (i == count) {  // loop fell through: no device matched
        fprintf(stderr, "There is no device supporting cuda.\n");
        return false;
    }
    cudaSetDevice(i);
    printf("CUDA initialized.\n");
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device: \"%s\"\n", prop.name);
    return true;
}


/*
 * Matrix dimensions and kernel launch configuration.
 * A is AH x AW, B is AW x BW, so C = A*B is AH x BW.
 *
 * The original smaller configuration is kept below for reference but
 * disabled: redefining a macro to a different value without #undef is
 * a compile error, so only one set may be active at a time.
 */
#if 0
#define AW 855
#define AH 511
#define BW 1013
#define Blocknum 32
#define Threadnum 256
#endif

#define AW 2000
#define AH 2000
#define BW 2000
#define Blocknum 445   // thread blocks per kernel launch
#define Threadnum 445  // threads per block


// Simple row-major integer matrix. `element` is a flat width*height
// buffer owned by whoever allocated it (malloc on the host side);
// element[row * width + col] addresses one entry.
typedef struct
{
    int width;     // number of columns
    int height;    // number of rows
    int *element;  // row-major data, length width * height
} Matrix;
// Allocate a w x h matrix and fill it with random digits in [0, 9].
// The caller owns the returned element buffer and must free() it.
Matrix Initmatrix(int w, int h)
{
    Matrix t;
    t.element = (int *)malloc(w * h * sizeof(int));
    for (int i = 0; i < w * h; i++)
        t.element[i] = rand() % 10;
    t.width = w;
    t.height = h;
    return t;
}
// CPU reference implementation of the matrix product t = a * b
// (row-major). Requires a.width == b.height; the caller owns and must
// free() the returned element buffer.
Matrix MM(Matrix a, Matrix b)
{
    Matrix t;
    t.element = (int *)malloc(a.height * b.width * sizeof(int));
    t.width = b.width;
    t.height = a.height;
    for (int i = 0; i < t.width * t.height; i++)
    {
        int x = i / t.width * a.width;      // offset of this output row in a
        int y = i - i / t.width * t.width;  // output column (i % t.width)
        t.element[i] = 0;
        for (int k = 0; k < a.width; k++)
        {
            // dot product of row (i / t.width) of a with column y of b
            t.element[i] += a.element[x + k] * b.element[y + b.width * k];
        }
    }
    return t;
}







// Matrix multiply kernel: mc = ma * mb, all row-major flat int arrays
// in global memory. mp carries the dimensions copied from the host:
//   mp[0]=A.width, mp[1]=A.height, mp[2]=B.width, mp[3]=B.height,
//   mp[4]=C.width, mp[5]=C.height
// Launched 1-D (<<<Blocknum, Threadnum>>>); the grid-stride loop makes
// the kernel correct for any launch configuration.
__global__ static void Matrixmul(int *ma, int *mb, int *mc, int *mp)
{
    int aw = mp[0];
    int bw = mp[2];
    int cw = mp[4];
    int ch = mp[5];
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;

    // Grid-stride loop over all output elements of C.
    int stride = blockDim.x * gridDim.x;
    for (int i = bid * blockDim.x + tid; i < cw * ch; i += stride)
    {
        int x = i / cw * aw;      // offset of this output row in ma
        int y = i - i / cw * cw;  // output column (i % cw)
        int sum = 0;              // accumulate in a register, not global memory
        for (int k = 0; k < aw; k++)
        {
            sum += ma[x + k] * mb[y + k * bw];
        }
        mc[i] = sum;  // single global write per output element
    }
}
// Multiply two random matrices on the CPU and on the GPU, report the
// wall-clock time of each (in clock() ticks), and verify the GPU result
// against the CPU reference element by element.
extern "C" void Gpucputest()
{
    if (!Initcuda()) {
        return;
    }

    // Host matrices: A is AH x AW, B is AW x BW, so C = A*B is AH x BW.
    Matrix matrixa = Initmatrix(AW, AH);
    Matrix matrixb = Initmatrix(BW, AW);
    Matrix matrixc;
    Matrix gpuresult = Initmatrix(BW, AH);  // buffer that receives the GPU result

    int matrixprop[6];

    // --- CPU matrix multiplication, timed with clock() ---
    int start = clock();
    matrixc = MM(matrixa, matrixb);
    int finish = clock();
    printf("CPU time =%d\n", finish - start);

    // --- GPU path, timed from here through the device-to-host copy ---
    start = clock();
    matrixprop[0] = matrixa.width;
    matrixprop[1] = matrixa.height;
    matrixprop[2] = matrixb.width;
    matrixprop[3] = matrixb.height;
    matrixprop[4] = matrixc.width;
    matrixprop[5] = matrixc.height;

    // Allocate device memory.
    int *ma, *mb, *mc, *mp;
    cudaMalloc((void**)&ma, sizeof(int) * matrixa.width * matrixa.height);
    cudaMalloc((void**)&mb, sizeof(int) * matrixb.width * matrixb.height);
    cudaMalloc((void**)&mc, sizeof(int) * matrixc.width * matrixc.height);
    cudaMalloc((void**)&mp, sizeof(int) * 6);

    // Copy the inputs and the dimension table to the device.
    cudaMemcpy(ma, matrixa.element, sizeof(int) * matrixa.width * matrixa.height, cudaMemcpyHostToDevice);
    cudaMemcpy(mb, matrixb.element, sizeof(int) * matrixb.width * matrixb.height, cudaMemcpyHostToDevice);
    cudaMemcpy(mp, matrixprop, sizeof(int) * 6, cudaMemcpyHostToDevice);

    // Launch the kernel and wait for it: launches are asynchronous, so
    // the timing below would be wrong without an explicit synchronize.
    Matrixmul<<<Blocknum, Threadnum, 0>>>(ma, mb, mc, mp);
    cudaDeviceSynchronize();

    // Copy the result back to the host.
    cudaMemcpy(gpuresult.element, mc, sizeof(int) * gpuresult.width * gpuresult.height, cudaMemcpyDeviceToHost);
    finish = clock();
    printf("GPU time =%d\n", finish - start);

    // Verify: every GPU element must match the CPU reference.
    for (int i = 0; i < gpuresult.width * gpuresult.height; i++)
    {
        printf("%d--%d\n", matrixc.element[i], gpuresult.element[i]);
        if (matrixc.element[i] != gpuresult.element[i])
        {
            printf("ERROR");
        }
    }

    // Release device memory, then the four host buffers (previously leaked).
    cudaFree(ma);
    cudaFree(mb);
    cudaFree(mc);
    cudaFree(mp);
    free(matrixa.element);
    free(matrixb.element);
    free(matrixc.element);
    free(gpuresult.element);
    system("pause");
}


When calling this from a .cpp file, remember to declare it with extern "C" — note the capital C.

You can then run it. To call a function defined in a .cu or .cuh file from a .cpp or .h file, do not #include the CUDA source; instead declare the CUDA function as extern "C" in the .cu file and add a matching extern declaration in the .cpp or .h file — that exposes it as a public symbol you can call directly.

The above program runs as follows:



The writing is more coarse, mainly to leave a mark.


Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.