#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Prints a rows x cols row-major matrix: one line per row, each element
 * formatted as "%8.1f", followed by one blank line. */
static void printRowMajorMatrix (const float *m, unsigned int rows, unsigned int cols)
{
    for (unsigned int y = 0; y < rows; ++y)
    {
        for (unsigned int x = 0; x < cols; ++x)
        {
            /* size_t arithmetic so y*cols cannot wrap for large matrices. */
            printf ("%8.1f", m[(size_t) y * cols + x]);
        }
        printf ("\n");
    }
    printf ("\n");
}

/*
 * CPU reference matrix multiply: c = a * b, all matrices row-major.
 *
 * Prints a, b and the product to stdout.
 *
 *   c      output, aH x bW elements, fully overwritten
 *   a      left operand,  aH x aW elements
 *   b      right operand, bH x bW elements
 *   aH,aW  rows / columns of a
 *   bH,bW  rows / columns of b
 *
 * The inner dimension must agree (aW == bH). On a mismatch an error is
 * written to stderr and the function returns without printing anything or
 * touching c, instead of reading b out of bounds.
 */
void Multicpu (float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
    if (aW != bH)
    {
        fprintf (stderr, "Multicpu: dimension mismatch, a is <%u,%u> but b is <%u,%u>\n",
                 aH, aW, bH, bW);
        return;
    }

    printf ("\n");
    printf ("Matrix a<%2u,%2u> = \n", aH, aW);
    printRowMajorMatrix (a, aH, aW);

    printf ("Matrix b<%2u,%2u> = \n", bH, bW);
    printRowMajorMatrix (b, bH, bW);

    printf ("Matrix a*b<%2u,%2u> = \n", aH, bW);
    for (unsigned int y = 0; y < aH; ++y)
    {
        for (unsigned int x = 0; x < bW; ++x)
        {
            /* Dot product of row y of a with column x of b. */
            float sum = 0.0f;
            for (unsigned int i = 0; i < aW; ++i)
            {
                sum += a[(size_t) y * aW + i] * b[(size_t) i * bW + x];
            }
            c[(size_t) y * bW + x] = sum;
        }
    }
    printRowMajorMatrix (c, aH, bW);
}
/*
 * Transposes a row-major aH x aW matrix in place and prints the result.
 *
 * On return `a` holds the aW x aH transpose in row-major order; the element
 * count is unchanged, so the same buffer is reused. The transposed matrix is
 * printed to stdout (aW lines of aH values, "%8.1f" each) followed by a blank
 * line.
 *
 *   a    matrix data, aH * aW floats, overwritten with the transpose
 *   aH   number of rows of the input
 *   aW   number of columns of the input
 *
 * If the scratch buffer cannot be allocated an error is written to stderr
 * and `a` is left unmodified.
 */
void Trans (float *a, unsigned int aH, unsigned int aW)
{
    const size_t count = (size_t) aH * aW;

    if (count > 0)
    {
        float *tr = (float *) malloc (sizeof (float) * count);
        if (tr == NULL)
        {
            fprintf (stderr, "Trans: failed to allocate %zu bytes of scratch space\n",
                     sizeof (float) * count);
            return;
        }

        /* Walk the source column by column so the scratch buffer is filled
         * sequentially in transposed (row-major aW x aH) order. */
        size_t k = 0;
        for (unsigned int x = 0; x < aW; ++x)
        {
            for (unsigned int y = 0; y < aH; ++y)
            {
                tr[k++] = a[(size_t) y * aW + x];
            }
        }

        memcpy (a, tr, sizeof (float) * count);
        free (tr);
    }

    /* Print with swapped dimensions: aW rows of aH columns. */
    for (unsigned int y = 0; y < aW; ++y)
    {
        for (unsigned int x = 0; x < aH; ++x)
        {
            printf ("%8.1f", a[(size_t) y * aH + x]);
        }
        printf ("\n");
    }
    printf ("\n");
}
/*
 * Demo driver: multiplies a 3x5 matrix by a 5x4 matrix on the CPU (Multicpu)
 * and on the GPU (cublasSgemm), printing both results for visual comparison.
 *
 * The host arrays are row-major while cuBLAS works in column-major order, so
 * both inputs are passed with CUBLAS_OP_T. The product cuBLAS writes is
 * column-major as well, i.e. it reads as the transpose when the buffer is
 * interpreted row-major; it is printed once as-is and once after Trans().
 *
 * Returns EXIT_SUCCESS if every CUDA / cuBLAS call succeeded, EXIT_FAILURE
 * otherwise. Device buffers and the cuBLAS handle are released on all paths.
 */
int main ()
{
    const int ahight = 3, awidth = 5;
    const int bhight = 5, bwidth = 4;
    float a[ahight*awidth] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    float b[bhight*bwidth] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
    float c[ahight*bwidth] = {0};
    float c_cublas[ahight*bwidth] = {0};

    const size_t aBytes = sizeof (float) * ahight * awidth;
    const size_t bBytes = sizeof (float) * bhight * bwidth;
    const size_t cBytes = sizeof (float) * ahight * bwidth;

    /* Every initialised local is declared before the first `goto Cleanup`:
     * C++ forbids a jump that bypasses a variable's initialisation. */
    float *gpu_a = NULL;
    float *gpu_b = NULL;
    float *gpu_c = NULL;
    cublasHandle_t handle = NULL;
    cublasStatus_t ret = CUBLAS_STATUS_SUCCESS;
    cudaError_t cudaStatus = cudaSuccess;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    int exitCode = EXIT_FAILURE;

    /* CPU reference result (also prints a, b and a*b). */
    Multicpu (c, a, b, ahight, awidth, bhight, bwidth);

    cudaStatus = cudaSetDevice (0);
    if (cudaStatus != cudaSuccess) {
        fprintf (stderr, "cudaSetDevice failed (%s)! Do you have a CUDA-capable GPU installed?\n",
                 cudaGetErrorString (cudaStatus));
        goto Cleanup;
    }

    cudaStatus = cudaMalloc ((void**) &gpu_a, aBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf (stderr, "cudaMalloc (gpu_a) failed: %s\n", cudaGetErrorString (cudaStatus));
        goto Cleanup;
    }
    cudaStatus = cudaMalloc ((void**) &gpu_b, bBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf (stderr, "cudaMalloc (gpu_b) failed: %s\n", cudaGetErrorString (cudaStatus));
        goto Cleanup;
    }
    cudaStatus = cudaMalloc ((void**) &gpu_c, cBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf (stderr, "cudaMalloc (gpu_c) failed: %s\n", cudaGetErrorString (cudaStatus));
        goto Cleanup;
    }

    cudaStatus = cudaMemcpy (gpu_a, a, aBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf (stderr, "cudaMemcpy (a -> gpu_a) failed: %s\n", cudaGetErrorString (cudaStatus));
        goto Cleanup;
    }
    cudaStatus = cudaMemcpy (gpu_b, b, bBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf (stderr, "cudaMemcpy (b -> gpu_b) failed: %s\n", cudaGetErrorString (cudaStatus));
        goto Cleanup;
    }

    printf ("Computing result using cublas...\n");
    ret = cublasCreate (&handle);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        printf ("cublasCreate returned error code %d, line (%d)\n", (int) ret, __LINE__);
        handle = NULL;  /* make sure cleanup does not destroy an invalid handle */
        goto Cleanup;
    }

    /* m = ahight, n = bwidth, k = awidth. With CUBLAS_OP_T the leading
     * dimensions are the row lengths of the row-major host arrays; the
     * column-major result has leading dimension m. */
    ret = cublasSgemm (handle, CUBLAS_OP_T, CUBLAS_OP_T, ahight, bwidth, awidth,
                       &alpha, gpu_a, awidth, gpu_b, bwidth, &beta, gpu_c, ahight);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        printf ("cublasSgemm returned error code %d, line (%d)\n", (int) ret, __LINE__);
        goto Cleanup;
    }

    /* Blocking copy: also waits for the GEMM to finish. */
    cudaStatus = cudaMemcpy (c_cublas, gpu_c, cBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf (stderr, "cudaMemcpy (gpu_c -> c_cublas) failed: %s\n", cudaGetErrorString (cudaStatus));
        goto Cleanup;
    }

    /*
     * Alternative CPU cross-check (disabled): transposing both inputs on the
     * host and multiplying them in swapped order gives (a*b) transposed,
     * which is the layout printed first below.
     *
     *   Trans (b, bhight, bwidth);
     *   Trans (a, ahight, awidth);
     *   Multicpu (c, b, a, bwidth, bhight, awidth, ahight);
     */

    printf ("\ncublasSgemm (handle, CUBLAS_OP_T, CUBLAS_OP_T, ahight, bwidth, awidth, &alpha, gpu_a, awidth, gpu_b, bwidth, &beta, gpu_c, ahight);\n");
    printf ("c_cublas<%2d,%2d> = \n", bwidth, ahight);
    /* Raw buffer read row-major: bwidth rows of ahight values. */
    for (int y = 0; y < bwidth; ++y)
    {
        for (int x = 0; x < ahight; ++x)
        {
            printf ("%8.1f", c_cublas[y * ahight + x]);
        }
        printf ("\n");
    }
    printf ("\n");

    printf ("After trans:c_cublas<%2d,%2d> = \n", ahight, bwidth);
    Trans (c_cublas, bwidth, ahight);
    printf ("\n");

    exitCode = EXIT_SUCCESS;

Cleanup:
    if (handle != NULL) {
        cublasDestroy (handle);
    }
    /* cudaFree accepts null pointers, so this is safe on every path. */
    cudaFree (gpu_a);
    cudaFree (gpu_b);
    cudaFree (gpu_c);
    return exitCode;
}