#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Edge length of the square thread block / shared-memory tile used by the kernels.
#define block_size 16
// Forward declaration: multiplies A (ah x aw) by B (bh x bw) into C on the GPU
// using the hand-written tiled kernels; the definition appears later in this file.
cudaError_t multicuda(float *C, float *A, float *B, unsigned int ah, unsigned int aw, unsigned int bh, unsigned int bw);
/**
 * Tiled matrix multiplication kernel: C = A * B for row-major float matrices.
 *
 * Each thread block produces one block_size x block_size tile of C and each
 * thread produces a single element of that tile. For every tile step along the
 * shared dimension the block stages one tile of A and one tile of B in shared
 * memory, synchronizes, and accumulates the partial dot product.
 *
 * Launch requirements (not checked on the device):
 *   - blockDim must be (block_size, block_size).
 *   - gridDim must be (bw / block_size, rows_of_A / block_size).
 *   - aw, bw and the row count of A must be multiples of block_size; there are
 *     no bounds guards, so other sizes read/write out of range or drop the tail.
 *
 * Static shared memory: 2 * block_size * block_size * sizeof(float) per block.
 *
 * @param C   device pointer to the output matrix, row-major, row width bw
 * @param A   device pointer to the left operand, row-major, row width aw
 * @param B   device pointer to the right operand, row-major, row width bw
 * @param aw  number of columns of A (the shared dimension)
 * @param bw  number of columns of B (and of C)
 */
__global__ void multikernel(float *C, float *A, float *B, unsigned int aw, unsigned int bw)
{
    // Tiles must be float: declaring them as int truncates every non-integral
    // input value before it is multiplied.
    __shared__ float asub[block_size][block_size];
    __shared__ float bsub[block_size][block_size];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    // Global row/column of the C element owned by this thread.
    const size_t row = (size_t)blockIdx.y * blockDim.y + ty;
    const size_t col = (size_t)blockIdx.x * blockDim.x + tx;

    float cvalue = 0.0f;
    const unsigned int tiles = aw / block_size;
    for (unsigned int i = 0; i < tiles; ++i)
    {
        // Each thread loads one element of the current A tile and B tile.
        asub[ty][tx] = A[row * aw + (size_t)i * blockDim.x + tx];
        bsub[ty][tx] = B[((size_t)i * blockDim.y + ty) * bw + col];

        // All loads must land before any thread reads the tiles.
        __syncthreads();

        for (int e = 0; e < block_size; ++e)
        {
            cvalue += asub[ty][e] * bsub[e][tx];
        }

        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();
    }

    C[row * bw + col] = cvalue;
}
/**
 * Variant of the tiled matrix multiplication kernel (C = A * B, row-major
 * floats) in which the inner 16-term dot product is written out by hand
 * instead of using a loop, so its run time can be compared with the looped
 * version.
 *
 * Because the 16 terms are spelled out explicitly, this kernel is only valid
 * when block_size == 16.
 *
 * Launch requirements (not checked on the device):
 *   - blockDim must be (block_size, block_size).
 *   - gridDim must be (bw / block_size, rows_of_A / block_size).
 *   - aw, bw and the row count of A must be multiples of block_size; there are
 *     no bounds guards.
 *
 * Static shared memory: 2 * block_size * block_size * sizeof(float) per block.
 *
 * @param C   device pointer to the output matrix, row-major, row width bw
 * @param A   device pointer to the left operand, row-major, row width aw
 * @param B   device pointer to the right operand, row-major, row width bw
 * @param aw  number of columns of A (the shared dimension)
 * @param bw  number of columns of B (and of C)
 */
__global__ void multikernel_noloop(float *C, float *A, float *B, unsigned int aw, unsigned int bw)
{
    // Tiles must be float: declaring them as int truncates every non-integral
    // input value before it is multiplied.
    __shared__ float asub[block_size][block_size];
    __shared__ float bsub[block_size][block_size];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    // Global row/column of the C element owned by this thread.
    const size_t row = (size_t)blockIdx.y * blockDim.y + ty;
    const size_t col = (size_t)blockIdx.x * blockDim.x + tx;

    float cvalue = 0.0f;
    const unsigned int tiles = aw / block_size;
    for (unsigned int i = 0; i < tiles; ++i)
    {
        asub[ty][tx] = A[row * aw + (size_t)i * blockDim.x + tx];
        bsub[ty][tx] = B[((size_t)i * blockDim.y + ty) * bw + col];

        // All loads must land before any thread reads the tiles.
        __syncthreads();

        // Hand-unrolled dot product over the 16 elements of the tile.
        cvalue += asub[ty][0] * bsub[0][tx] + asub[ty][1] * bsub[1][tx] +
                  asub[ty][2] * bsub[2][tx] + asub[ty][3] * bsub[3][tx] +
                  asub[ty][4] * bsub[4][tx] + asub[ty][5] * bsub[5][tx] +
                  asub[ty][6] * bsub[6][tx] + asub[ty][7] * bsub[7][tx] +
                  asub[ty][8] * bsub[8][tx] + asub[ty][9] * bsub[9][tx] +
                  asub[ty][10] * bsub[10][tx] + asub[ty][11] * bsub[11][tx] +
                  asub[ty][12] * bsub[12][tx] + asub[ty][13] * bsub[13][tx] +
                  asub[ty][14] * bsub[14][tx] + asub[ty][15] * bsub[15][tx];

        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();
    }

    C[row * bw + col] = cvalue;
}
// Forward declaration: GPU multiplication of A (ah x aw) by B (bh x bw) through cuBLAS.
cudaError_t multiwithcublase(float *C, float *A, float *B, unsigned int ah, unsigned int aw, unsigned int bh, unsigned int bw);
// Forward declaration: host-side reference multiplication of A (ah x aw) by B (bh x bw).
void multicpu(float *C, float *A, float *B, unsigned int ah, unsigned int aw, unsigned int bh, unsigned int bw);
/**
 * Benchmark driver.
 *
 * Builds two 320 x 320 host matrices whose element (y, x) is min(x, y),
 * multiplies them with the hand-written kernels (multicuda), with cuBLAS
 * (multiwithcublase) and on the CPU (multicpu), and prints the CPU time.
 * The GPU paths print their own timings.
 *
 * @return 0 on success, 1 if a host allocation or any CUDA step failed.
 */
int main()
{
    const unsigned int ah = 320;
    const unsigned int aw = 320;
    const unsigned int bw = 320;
    const unsigned int bh = aw;
    const unsigned int ch = ah;
    const unsigned int cw = bw;

    // Everything is declared before the first goto so that no jump crosses an
    // initialized declaration.
    int exitcode = 1;
    cudaError_t cudastatus;
    clock_t cpustart;
    clock_t cpuend;
    float *cpu_a = (float *)malloc((size_t)ah * aw * sizeof(float));
    float *cpu_b = (float *)malloc((size_t)bh * bw * sizeof(float));
    float *cpu_c = (float *)malloc((size_t)ch * cw * sizeof(float));

    if (cpu_a == NULL || cpu_b == NULL || cpu_c == NULL) {
        fprintf(stderr, "host malloc failed!\n");
        goto cleanup;
    }

    // A(y, x) = min(x, y)
    for (unsigned int y = 0; y < ah; ++y)
    {
        for (unsigned int x = 0; x < aw; ++x)
        {
            cpu_a[y * aw + x] = (float)(x < y ? x : y);
        }
    }

    // B(y, x) = min(x, y)
    for (unsigned int y = 0; y < bh; ++y)
    {
        for (unsigned int x = 0; x < bw; ++x)
        {
            cpu_b[y * bw + x] = (float)(x < y ? x : y);
        }
    }

    cudastatus = multicuda(cpu_c, cpu_a, cpu_b, ah, aw, bh, bw);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "multicuda failed!\n");
        goto cleanup;
    }
    // Debug aid: printf("C(1, 1) = %.1f\n", cpu_c[1 * cw + 1]);

    cudastatus = multiwithcublase(cpu_c, cpu_a, cpu_b, ah, aw, bh, bw);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "multiwithcublase failed!\n");
        goto cleanup;
    }
    // Debug aid: printf("C(1, 1) = %.1f\n", cpu_c[1 * cw + 1]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudastatus = cudaDeviceReset();
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        goto cleanup;
    }

    // clock() returns clock_t ticks; convert through CLOCKS_PER_SEC so the
    // printed value really is milliseconds on every platform.
    cpustart = clock();
    multicpu(cpu_c, cpu_a, cpu_b, ah, aw, bh, bw);
    cpuend = clock();
    printf("CPU Runtime is %f msec\n", 1000.0 * (double)(cpuend - cpustart) / CLOCKS_PER_SEC);

    exitcode = 0;

cleanup:
    // free(NULL) is a no-op, so this is safe on every path.
    free(cpu_a);
    free(cpu_b);
    free(cpu_c);

    if (exitcode == 0) {
        // Keep the console window open until the user presses a key.
        getchar();
    }
    return exitcode;
}
// Launches `kernel` on the default stream between two CUDA events and prints
// the elapsed time as "<label> Runtime is <ms> msec". Returns the first error.
static cudaError_t timekernel(void (*kernel)(float *, float *, float *, unsigned int, unsigned int),
                              const char *label, dim3 grid, dim3 block,
                              float *gpu_c, float *gpu_a, float *gpu_b,
                              unsigned int aw, unsigned int bw)
{
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float msectotal = 0.0f;
    cudaError_t status;

    status = cudaEventCreate(&start);
    if (status != cudaSuccess) {
        fprintf(stderr, "failed to create start event (error code %s)!\n", cudaGetErrorString(status));
        goto done;
    }
    status = cudaEventCreate(&stop);
    if (status != cudaSuccess) {
        fprintf(stderr, "failed to create stop event (error code %s)!\n", cudaGetErrorString(status));
        goto done;
    }
    status = cudaEventRecord(start, NULL);
    if (status != cudaSuccess) {
        fprintf(stderr, "failed to record start event (error code %s)!\n", cudaGetErrorString(status));
        goto done;
    }

    kernel<<<grid, block>>>(gpu_c, gpu_a, gpu_b, aw, bw);

    // A launch does not return a status; configuration errors surface here.
    status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "%s kernel launch failed: %s\n", label, cudaGetErrorString(status));
        goto done;
    }

    status = cudaEventRecord(stop, NULL);
    if (status != cudaSuccess) {
        fprintf(stderr, "failed to record stop event (error code %s)!\n", cudaGetErrorString(status));
        goto done;
    }
    // Waits for the kernel; faults raised while it ran are reported here.
    status = cudaEventSynchronize(stop);
    if (status != cudaSuccess) {
        fprintf(stderr, "failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(status));
        goto done;
    }
    status = cudaEventElapsedTime(&msectotal, start, stop);
    if (status != cudaSuccess) {
        fprintf(stderr, "failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(status));
        goto done;
    }
    printf("%s Runtime is %f msec\n", label, msectotal);

done:
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    return status;
}

/**
 * Multiplies two row-major host matrices on the GPU with the hand-written
 * tiled kernels and copies the product back to the host.
 *
 * Both multikernel and multikernel_noloop are run (and timed) on the same
 * inputs; each writes the full result, so C holds the output of the second.
 *
 * The kernels have no bounds guards, so ah, aw and bw must be non-zero
 * multiples of block_size and aw must equal bh; otherwise
 * cudaErrorInvalidValue is returned before anything is allocated.
 *
 * @param C   host output buffer, ah * bw floats
 * @param A   host left operand, ah * aw floats
 * @param B   host right operand, bh * bw floats
 * @param ah  rows of A
 * @param aw  columns of A
 * @param bh  rows of B
 * @param bw  columns of B
 * @return cudaSuccess, or the first CUDA error encountered.
 */
cudaError_t multicuda(float *C, float *A, float *B, unsigned int ah, unsigned int aw, unsigned int bh, unsigned int bw)
{
    // All locals are declared up front so that no goto crosses an initialized
    // declaration. Sizes are computed in size_t to avoid unsigned int overflow.
    float *gpu_a = NULL;
    float *gpu_b = NULL;
    float *gpu_c = NULL;
    const size_t size_a = (size_t)ah * aw * sizeof(float);
    const size_t size_b = (size_t)bh * bw * sizeof(float);
    const size_t size_c = (size_t)ah * bw * sizeof(float);
    const dim3 blocks(block_size, block_size);
    const dim3 grids(bw / block_size, ah / block_size);
    cudaError_t cudastatus;

    if (ah == 0 || aw == 0 || bw == 0 || aw != bh ||
        ah % block_size != 0 || aw % block_size != 0 || bw % block_size != 0) {
        fprintf(stderr, "multicuda: dimensions must be non-zero multiples of %d and aw must equal bh\n", block_size);
        return cudaErrorInvalidValue;
    }

    cudastatus = cudaSetDevice(0);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto error;
    }

    cudastatus = cudaMalloc((void **)&gpu_a, size_a);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto error;
    }
    cudastatus = cudaMalloc((void **)&gpu_b, size_b);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto error;
    }
    cudastatus = cudaMalloc((void **)&gpu_c, size_c);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto error;
    }

    cudastatus = cudaMemcpy(gpu_a, A, size_a, cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto error;
    }
    cudastatus = cudaMemcpy(gpu_b, B, size_b, cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto error;
    }

    cudastatus = timekernel(multikernel, "HS_GPU", grids, blocks, gpu_c, gpu_a, gpu_b, aw, bw);
    if (cudastatus != cudaSuccess) {
        goto error;
    }
    cudastatus = timekernel(multikernel_noloop, "HS_noloop GPU", grids, blocks, gpu_c, gpu_a, gpu_b, aw, bw);
    if (cudastatus != cudaSuccess) {
        goto error;
    }

    // cudaDeviceSynchronize waits for all device work to finish and returns
    // any error raised while it ran.
    cudastatus = cudaDeviceSynchronize();
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching the kernels!\n", cudastatus);
        goto error;
    }

    cudastatus = cudaMemcpy(C, gpu_c, size_c, cudaMemcpyDeviceToHost);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto error;
    }

error:
    // Reached on success as well; cudaFree(NULL) is a no-op.
    cudaFree(gpu_a);
    cudaFree(gpu_b);
    cudaFree(gpu_c);
    return cudastatus;
}
/**
 * Terminates the program if a cuBLAS call did not succeed.
 *
 * @param status  status returned by the cuBLAS call
 * @param msg     message written to stderr before exiting with EXIT_FAILURE
 */
inline void checkerror(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        // Errors go to stderr, like every other diagnostic in this file.
        fprintf(stderr, "%s", msg);
        exit(EXIT_FAILURE);
    }
}
/**
 * Multiplies two row-major host matrices on the GPU with cuBLAS (SGEMM),
 * prints the elapsed GPU time and copies the product back to the host.
 *
 * cuBLAS works on column-major data. A row-major matrix read as column-major
 * is its transpose, so the call computes C^T = B^T * A^T by passing B first
 * and A second with their row widths as leading dimensions; the result, read
 * back as row-major, is exactly A * B.
 *
 * All resources (device buffers, events, cuBLAS handle) are released on every
 * path. cuBLAS failures are reported as cudaErrorUnknown because the function
 * returns a cudaError_t.
 *
 * @param C   host output buffer, ah * bw floats
 * @param A   host left operand, ah * aw floats
 * @param B   host right operand, bh * bw floats
 * @param ah  rows of A
 * @param aw  columns of A
 * @param bh  rows of B (must equal aw)
 * @param bw  columns of B
 * @return cudaSuccess, or the first error encountered.
 */
cudaError_t multiwithcublase(float *C, float *A, float *B, unsigned int ah, unsigned int aw, unsigned int bh, unsigned int bw)
{
    // All locals are declared up front so that no goto crosses an initialized
    // declaration. Sizes are computed in size_t to avoid unsigned int overflow.
    float *gpu_a = NULL;
    float *gpu_b = NULL;
    float *gpu_c = NULL;
    const size_t size_a = (size_t)ah * aw * sizeof(float);
    const size_t size_b = (size_t)bh * bw * sizeof(float);
    const size_t size_c = (size_t)ah * bw * sizeof(float);
    const float alpha = 1.0f;
    const float beta = 0.0f;
    float msectotal = 0.0f;
    cublasHandle_t handle = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    cublasStatus_t ret;
    cudaError_t cudastatus;

    if (aw != bh) {
        fprintf(stderr, "multiwithcublase: aw must equal bh\n");
        return cudaErrorInvalidValue;
    }

    cudastatus = cudaSetDevice(0);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto error;
    }

    cudastatus = cudaMalloc((void **)&gpu_a, size_a);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto error;
    }
    cudastatus = cudaMalloc((void **)&gpu_b, size_b);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto error;
    }
    cudastatus = cudaMalloc((void **)&gpu_c, size_c);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto error;
    }

    cudastatus = cudaMemcpy(gpu_a, A, size_a, cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto error;
    }
    cudastatus = cudaMemcpy(gpu_b, B, size_b, cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto error;
    }

    ret = cublasCreate(&handle);
    if (ret != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
        handle = NULL;
        // Without this the caller would see cudaSuccess from the last memcpy.
        cudastatus = cudaErrorUnknown;
        goto error;
    }

    cudastatus = cudaEventCreate(&start);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "failed to create start event (error code %s)!\n", cudaGetErrorString(cudastatus));
        goto error;
    }
    cudastatus = cudaEventCreate(&stop);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "failed to create stop event (error code %s)!\n", cudaGetErrorString(cudastatus));
        goto error;
    }
    cudastatus = cudaEventRecord(start, NULL);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "failed to record start event (error code %s)!\n", cudaGetErrorString(cudastatus));
        goto error;
    }

    // Row-major A*B expressed as column-major B^T * A^T:
    //   m = bw, n = ah, k = aw, lda(B) = bw, ldb(A) = aw, ldc = bw.
    ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      (int)bw, (int)ah, (int)aw,
                      &alpha, gpu_b, (int)bw, gpu_a, (int)aw,
                      &beta, gpu_c, (int)bw);
    if (ret != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
        cudastatus = cudaErrorUnknown;
        goto error;
    }

    cudastatus = cudaEventRecord(stop, NULL);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "failed to record stop event (error code %s)!\n", cudaGetErrorString(cudastatus));
        goto error;
    }
    cudastatus = cudaEventSynchronize(stop);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudastatus));
        goto error;
    }
    cudastatus = cudaEventElapsedTime(&msectotal, start, stop);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudastatus));
        goto error;
    }
    printf("cublas_GPU Runtime is %f msec\n", msectotal);

    cudastatus = cudaMemcpy(C, gpu_c, size_c, cudaMemcpyDeviceToHost);
    if (cudastatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto error;
    }

error:
    // Reached on success as well; releases whatever was acquired.
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (handle != NULL) {
        ret = cublasDestroy(handle);
        if (ret != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cublasDestroy() error!\n");
            if (cudastatus == cudaSuccess) {
                cudastatus = cudaErrorUnknown;
            }
        }
    }
    cudaFree(gpu_a);
    cudaFree(gpu_b);
    cudaFree(gpu_c);
    return cudastatus;
}
/**
 * Reference matrix multiplication on the host: C = A * B, all row-major.
 *
 * @param C   output buffer, ah * bw floats; every element is overwritten
 * @param A   left operand, ah * aw floats
 * @param B   right operand, aw * bw floats
 * @param ah  rows of A (and of C)
 * @param aw  columns of A, i.e. the length of each dot product
 * @param bh  rows of B; not read by this function
 * @param bw  columns of B (and of C)
 */
void multicpu(float *C, float *A, float *B, unsigned int ah, unsigned int aw, unsigned int bh, unsigned int bw)
{
    (void)bh; // kept only so the signature mirrors the GPU entry points

    for (unsigned int y = 0; y < ah; ++y)
    {
        for (unsigned int x = 0; x < bw; ++x)
        {
            // Accumulate in a local so C is written once per element.
            float sum = 0.0f;
            for (unsigned int i = 0; i < aw; ++i)
            {
                sum += A[(size_t)y * aw + i] * B[(size_t)i * bw + x];
            }
            C[(size_t)y * bw + x] = sum;
        }
    }
}