#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "cublas_v2.h"
#define block_size 16
/***************/
// Using the cuBLAS built-in API function cublasSgemm
/**
 * Computes C = A * B in single precision with cuBLAS (cublasSgemm).
 *
 * NOTE(review): the original listing elided the device allocation, the
 * host/device copies and the cleanup ("......"). They are reconstructed here
 * on the assumption that a, b and c are HOST pointers to dense row-major
 * matrices (the same layout the kernels in this file index with) -- confirm
 * against the callers.
 *
 * cuBLAS interprets every buffer as column-major. A row-major M x N buffer is
 * bit-for-bit a column-major N x M buffer holding the transpose, so the
 * row-major product C = A * B is obtained by asking cuBLAS for
 * C^T = B^T * A^T: the operands are swapped and no explicit transpose is
 * needed.
 *
 * @param c   output matrix, ah x bw
 * @param a   left operand, ah x aw
 * @param b   right operand, bh x bw
 * @param ah  rows of A (and of C)
 * @param aw  columns of A; must equal bh
 * @param bh  rows of B
 * @param bw  columns of B (and of C)
 * @return cudaSuccess on success; cudaErrorInvalidValue for null pointers,
 *         zero or mismatched dimensions, or dimensions that do not fit the
 *         int parameters of cuBLAS; the failing CUDA runtime error code; or
 *         cudaErrorUnknown when a cuBLAS call reports a failure.
 */
cudaError_t multiwithcublase(float *c, float *a, float *b,
                             unsigned int ah, unsigned int aw,
                             unsigned int bh, unsigned int bw)
{
    /* Everything is declared before the first goto so that no jump to the
       cleanup label crosses an initialisation. */
    float *gpu_a = NULL;
    float *gpu_b = NULL;
    float *gpu_c = NULL;
    cublasHandle_t handle;
    bool haveHandle = false;
    cublasStatus_t ret;
    cudaError_t status;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    const unsigned int maxDim = 0x7fffffffu; /* cuBLAS takes dimensions as int */
    size_t aBytes;
    size_t bBytes;
    size_t cBytes;

    if (c == NULL || a == NULL || b == NULL)
        return cudaErrorInvalidValue;
    if (aw != bh)
        return cudaErrorInvalidValue; /* inner dimensions must agree */
    if (ah == 0 || aw == 0 || bw == 0)
        return cudaErrorInvalidValue; /* cuBLAS rejects zero leading dimensions */
    if (ah > maxDim || aw > maxDim || bw > maxDim)
        return cudaErrorInvalidValue;

    aBytes = (size_t)ah * aw * sizeof(float);
    bBytes = (size_t)bh * bw * sizeof(float);
    cBytes = (size_t)ah * bw * sizeof(float);

    status = cudaMalloc((void **)&gpu_a, aBytes);
    if (status != cudaSuccess) goto cleanup;
    status = cudaMalloc((void **)&gpu_b, bBytes);
    if (status != cudaSuccess) goto cleanup;
    status = cudaMalloc((void **)&gpu_c, cBytes);
    if (status != cudaSuccess) goto cleanup;

    status = cudaMemcpy(gpu_a, a, aBytes, cudaMemcpyHostToDevice);
    if (status != cudaSuccess) goto cleanup;
    status = cudaMemcpy(gpu_b, b, bBytes, cudaMemcpyHostToDevice);
    if (status != cudaSuccess) goto cleanup;

    ret = cublasCreate(&handle);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        status = cudaErrorUnknown;
        goto cleanup;
    }
    haveHandle = true;

    /* Column-major view: C^T (bw x ah) = B^T (bw x aw) * A^T (aw x ah). */
    ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      (int)bw, (int)ah, (int)aw,
                      &alpha,
                      gpu_b, (int)bw,
                      gpu_a, (int)aw,
                      &beta,
                      gpu_c, (int)bw);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        status = cudaErrorUnknown;
        goto cleanup;
    }

    /* Blocking copy: also waits for the GEMM and reports any execution error. */
    status = cudaMemcpy(c, gpu_c, cBytes, cudaMemcpyDeviceToHost);

cleanup:
    if (haveHandle)
        cublasDestroy(handle);
    cudaFree(gpu_c); /* cudaFree(NULL) is a harmless no-op */
    cudaFree(gpu_b);
    cudaFree(gpu_a);
    return status;
}
/***************/
// Using shared memory as an aid
/**
 * Tiled matrix multiplication C = A * B staged through shared memory.
 *
 * All matrices are dense and row-major. Each thread block produces one
 * block_size x block_size tile of C; each thread accumulates one element.
 *
 * Launch requirements:
 *   - blockDim must be (block_size, block_size); the shared tiles are sized
 *     by block_size.
 *   - gridDim.x * block_size must cover bw (excess columns are masked off).
 *   - gridDim.y * block_size must equal the number of rows of A and C.
 *     The kernel receives no row count, so rows cannot be bounds-checked here.
 *   - B is assumed to have aw rows.
 *
 * @param C   output matrix, (rows of A) x bw
 * @param A   left operand, (rows of A) x aw
 * @param B   right operand, aw x bw
 * @param aw  number of columns of A
 * @param bw  number of columns of B
 */
__global__ void multikernel(float *C, float *A, float *B, unsigned int aw, unsigned int bw)
{
    /* Tiles must be float: integer tiles would truncate every operand. */
    __shared__ float asub[block_size][block_size];
    __shared__ float bsub[block_size][block_size];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int row = blockIdx.y * block_size + ty;
    const unsigned int col = blockIdx.x * block_size + tx;

    /* Round up so a trailing partial tile along the shared dimension is
       processed instead of being silently dropped. */
    const unsigned int tileCount = (aw + block_size - 1) / block_size;

    float cvalue = 0.0f;
    for (unsigned int t = 0; t < tileCount; ++t)
    {
        const unsigned int aCol = t * block_size + tx;
        const unsigned int bRow = t * block_size + ty;

        /* Out-of-range positions are zero-filled so they add nothing. */
        asub[ty][tx] = (aCol < aw) ? A[(size_t)row * aw + aCol] : 0.0f;
        bsub[ty][tx] = (bRow < aw && col < bw) ? B[(size_t)bRow * bw + col] : 0.0f;

        /* Every thread must have written its tile entries before any reads. */
        __syncthreads();

        for (int e = 0; e < block_size; ++e)
        {
            cvalue += asub[ty][e] * bsub[e][tx];
        }

        /* Keep the tiles intact until every thread has finished with them. */
        __syncthreads();
    }

    /* The guard sits after the loop so all threads reach every barrier. */
    if (col < bw)
    {
        C[(size_t)row * bw + col] = cvalue;
    }
}
/***************/
// Using shared memory as an aid, with the inner loop unrolled
/**
 * Tiled matrix multiplication C = A * B staged through shared memory, with
 * the per-tile dot product written out by hand instead of as an inner loop.
 *
 * All matrices are dense and row-major. Each thread block produces one
 * block_size x block_size tile of C; each thread accumulates one element.
 *
 * Launch requirements:
 *   - block_size must be 16: the unrolled sum below has exactly 16 terms.
 *   - blockDim must be (block_size, block_size); the shared tiles are sized
 *     by block_size.
 *   - gridDim.x * block_size must cover bw (excess columns are masked off).
 *   - gridDim.y * block_size must equal the number of rows of A and C.
 *     The kernel receives no row count, so rows cannot be bounds-checked here.
 *   - B is assumed to have aw rows.
 *
 * @param C   output matrix, (rows of A) x bw
 * @param A   left operand, (rows of A) x aw
 * @param B   right operand, aw x bw
 * @param aw  number of columns of A
 * @param bw  number of columns of B
 */
__global__ void multikernel_noloop(float *C, float *A, float *B, unsigned int aw, unsigned int bw)
{
    static_assert(block_size == 16,
                  "multikernel_noloop is hand-unrolled for block_size == 16");

    /* Tiles must be float: integer tiles would truncate every operand. */
    __shared__ float asub[block_size][block_size];
    __shared__ float bsub[block_size][block_size];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int row = blockIdx.y * block_size + ty;
    const unsigned int col = blockIdx.x * block_size + tx;

    /* Round up so a trailing partial tile along the shared dimension is
       processed instead of being silently dropped. */
    const unsigned int tileCount = (aw + block_size - 1) / block_size;

    float cvalue = 0.0f;
    for (unsigned int t = 0; t < tileCount; ++t)
    {
        const unsigned int aCol = t * block_size + tx;
        const unsigned int bRow = t * block_size + ty;

        /* Out-of-range positions are zero-filled so they add nothing. */
        asub[ty][tx] = (aCol < aw) ? A[(size_t)row * aw + aCol] : 0.0f;
        bsub[ty][tx] = (bRow < aw && col < bw) ? B[(size_t)bRow * bw + col] : 0.0f;

        /* Every thread must have written its tile entries before any reads. */
        __syncthreads();

        /* Hand-unrolled dot product over the 16 entries of the current tile. */
        cvalue += asub[ty][0] * bsub[0][tx] + asub[ty][1] * bsub[1][tx] +
                  asub[ty][2] * bsub[2][tx] + asub[ty][3] * bsub[3][tx] +
                  asub[ty][4] * bsub[4][tx] + asub[ty][5] * bsub[5][tx] +
                  asub[ty][6] * bsub[6][tx] + asub[ty][7] * bsub[7][tx] +
                  asub[ty][8] * bsub[8][tx] + asub[ty][9] * bsub[9][tx] +
                  asub[ty][10] * bsub[10][tx] + asub[ty][11] * bsub[11][tx] +
                  asub[ty][12] * bsub[12][tx] + asub[ty][13] * bsub[13][tx] +
                  asub[ty][14] * bsub[14][tx] + asub[ty][15] * bsub[15][tx];

        /* Keep the tiles intact until every thread has finished with them. */
        __syncthreads();
    }

    /* The guard sits after the loop so all threads reach every barrier. */
    if (col < bw)
    {
        C[(size_t)row * bw + col] = cvalue;
    }
}
// Matrix size 320*320:
// Matrix size 512*512:
// P.S. The test machine is quite slow.