Mixing C++ and CUDA: implementing a GPU program in a Visual Studio C++ project

Source: Internet
Author: User

First create your own project — an ordinary C++ project.

Write the C + + program you want to run



Check one of the CUDA build-customization options; I have two CUDA versions installed, so either one will do.

Select Properties

Of course, if the project was created as a CUDA C++ project from the start, this step is unnecessary.

Then open the project's properties and add cudart.lib to Additional Dependencies under Configuration Properties → Linker → Input.


And then you can write your own code.

The code in GPUCPUTEST.CU is as follows:

#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cuda.h>
using namespace Std;
// Initialize CUDA: find the first device with compute capability >= 1.0,
// make it the current device, and print its name.
// Returns false when no usable CUDA device is present.
bool Initcuda(void)
{
    int count = 0;
    cudaGetDeviceCount(&count);  // number of visible CUDA devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i = 0;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {  // any CUDA-capable device qualifies
                break;
            }
        }
    }
    if (i == count) {  // loop fell through: no device matched
        fprintf(stderr, "There is no device supporting cuda.\n");
        return false;
    }
    cudaSetDevice(i);
    printf("CUDA initialized.\n");
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device: \"%s\"\n", prop.name);
    return true;
}


/*
 * Matrix dimensions and kernel launch configuration.
 * A is AH x AW, B is AW x BW, so C = A*B is AH x BW.
 *
 * The original smaller configuration is kept below for reference but
 * disabled: redefining a macro to a different value without #undef is
 * a compile error, so only one set may be active at a time.
 */
#if 0
#define AW 855
#define AH 511
#define BW 1013
#define Blocknum 32
#define Threadnum 256
#endif

#define AW 2000
#define AH 2000
#define BW 2000
#define Blocknum 445   // thread blocks per kernel launch
#define Threadnum 445  // threads per block


// Simple row-major integer matrix. `element` is a flat width*height
// buffer owned by whoever allocated it (malloc on the host side);
// element[row * width + col] addresses one entry.
typedef struct
{
    int width;     // number of columns
    int height;    // number of rows
    int *element;  // row-major data, length width * height
} Matrix;
// Allocate a w x h matrix and fill it with random digits in [0, 9].
// The caller owns the returned element buffer and must free() it.
Matrix Initmatrix(int w, int h)
{
    Matrix t;
    t.element = (int *)malloc(w * h * sizeof(int));
    for (int i = 0; i < w * h; i++)
        t.element[i] = rand() % 10;
    t.width = w;
    t.height = h;
    return t;
}
// CPU reference implementation of the matrix product t = a * b
// (row-major). Requires a.width == b.height; the caller owns and must
// free() the returned element buffer.
Matrix MM(Matrix a, Matrix b)
{
    Matrix t;
    t.element = (int *)malloc(a.height * b.width * sizeof(int));
    t.width = b.width;
    t.height = a.height;
    for (int i = 0; i < t.width * t.height; i++)
    {
        int x = i / t.width * a.width;      // offset of this output row in a
        int y = i - i / t.width * t.width;  // output column (i % t.width)
        t.element[i] = 0;
        for (int k = 0; k < a.width; k++)
        {
            // dot product of row (i / t.width) of a with column y of b
            t.element[i] += a.element[x + k] * b.element[y + b.width * k];
        }
    }
    return t;
}







// Matrix multiply kernel: mc = ma * mb, all row-major flat int arrays
// in global memory. mp carries the dimensions copied from the host:
//   mp[0]=A.width, mp[1]=A.height, mp[2]=B.width, mp[3]=B.height,
//   mp[4]=C.width, mp[5]=C.height
// Launched 1-D (<<<Blocknum, Threadnum>>>); the grid-stride loop makes
// the kernel correct for any launch configuration.
__global__ static void Matrixmul(int *ma, int *mb, int *mc, int *mp)
{
    int aw = mp[0];
    int bw = mp[2];
    int cw = mp[4];
    int ch = mp[5];
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;

    // Grid-stride loop over all output elements of C.
    int stride = blockDim.x * gridDim.x;
    for (int i = bid * blockDim.x + tid; i < cw * ch; i += stride)
    {
        int x = i / cw * aw;      // offset of this output row in ma
        int y = i - i / cw * cw;  // output column (i % cw)
        int sum = 0;              // accumulate in a register, not global memory
        for (int k = 0; k < aw; k++)
        {
            sum += ma[x + k] * mb[y + k * bw];
        }
        mc[i] = sum;  // single global write per output element
    }
}
// Multiply two random matrices on the CPU and on the GPU, report the
// wall-clock time of each (in clock() ticks), and verify the GPU result
// against the CPU reference element by element.
extern "C" void Gpucputest()
{
    if (!Initcuda()) {
        return;
    }

    // Host matrices: A is AH x AW, B is AW x BW, so C = A*B is AH x BW.
    Matrix matrixa = Initmatrix(AW, AH);
    Matrix matrixb = Initmatrix(BW, AW);
    Matrix matrixc;
    Matrix gpuresult = Initmatrix(BW, AH);  // buffer that receives the GPU result

    int matrixprop[6];

    // --- CPU matrix multiplication, timed with clock() ---
    int start = clock();
    matrixc = MM(matrixa, matrixb);
    int finish = clock();
    printf("CPU time =%d\n", finish - start);

    // --- GPU path, timed from here through the device-to-host copy ---
    start = clock();
    matrixprop[0] = matrixa.width;
    matrixprop[1] = matrixa.height;
    matrixprop[2] = matrixb.width;
    matrixprop[3] = matrixb.height;
    matrixprop[4] = matrixc.width;
    matrixprop[5] = matrixc.height;

    // Allocate device memory.
    int *ma, *mb, *mc, *mp;
    cudaMalloc((void**)&ma, sizeof(int) * matrixa.width * matrixa.height);
    cudaMalloc((void**)&mb, sizeof(int) * matrixb.width * matrixb.height);
    cudaMalloc((void**)&mc, sizeof(int) * matrixc.width * matrixc.height);
    cudaMalloc((void**)&mp, sizeof(int) * 6);

    // Copy the inputs and the dimension table to the device.
    cudaMemcpy(ma, matrixa.element, sizeof(int) * matrixa.width * matrixa.height, cudaMemcpyHostToDevice);
    cudaMemcpy(mb, matrixb.element, sizeof(int) * matrixb.width * matrixb.height, cudaMemcpyHostToDevice);
    cudaMemcpy(mp, matrixprop, sizeof(int) * 6, cudaMemcpyHostToDevice);

    // Launch the kernel and wait for it: launches are asynchronous, so
    // the timing below would be wrong without an explicit synchronize.
    Matrixmul<<<Blocknum, Threadnum, 0>>>(ma, mb, mc, mp);
    cudaDeviceSynchronize();

    // Copy the result back to the host.
    cudaMemcpy(gpuresult.element, mc, sizeof(int) * gpuresult.width * gpuresult.height, cudaMemcpyDeviceToHost);
    finish = clock();
    printf("GPU time =%d\n", finish - start);

    // Verify: every GPU element must match the CPU reference.
    for (int i = 0; i < gpuresult.width * gpuresult.height; i++)
    {
        printf("%d--%d\n", matrixc.element[i], gpuresult.element[i]);
        if (matrixc.element[i] != gpuresult.element[i])
        {
            printf("ERROR");
        }
    }

    // Release device memory, then the four host buffers (previously leaked).
    cudaFree(ma);
    cudaFree(mb);
    cudaFree(mc);
    cudaFree(mp);
    free(matrixa.element);
    free(matrixb.element);
    free(matrixc.element);
    free(gpuresult.element);
    system("pause");
}


When calling this from a .cpp file, remember to declare it with extern "C" — note the capital C.

You can then run it. To call a function defined in a .cu or .cuh file from a .cpp or .h file, do not #include the CUDA source; instead declare the CUDA function as extern "C" in the .cu file and add a matching extern declaration in the .cpp or .h file — that exposes it as a public symbol you can call directly.

The above program runs as follows:



The writing is more coarse, mainly to leave a mark.


Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.