After studying CUDA for the past few days, I have found that its parallel model is not the same as ordinary CPU multithreading, but it is quite elegant: a task is divided into blocks, each block is divided into threads, and each thread then does its own small piece of work. This style of parallelism is a good fit for computations such as matrix operations, where the individual elements are only loosely coupled but the overall amount of data is very large. This is just my initial impression of CUDA.
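To make the block/thread decomposition concrete, here is a minimal sketch (not from the original article) of an elementwise kernel: the grid is split into blocks, each block into threads, and each thread handles one element identified by its block and thread indices.
// Minimal illustration only: one thread per output element.
__global__ void add_kernel(float* c, const float* a, const float* b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
    if (i < n)                                     // guard the last, partial block
        c[i] = a[i] + b[i];
}
// Launch with enough blocks to cover n elements, e.g.:
// add_kernel<<<(n + 255) / 256, 256>>>(d_c, d_a, d_b, n);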
The CPU program for matrix multiplication is as follows:
// C = A * B
void MatrixMulCPU(float* _C, const float* _A, const float* _B, int _wa, int _ha, int _wb)
{
    float sum = 0;
    for (int i = 0; i < _ha; ++i)
    {
        for (int j = 0; j < _wb; ++j)
        {
            sum = 0;
            for (int k = 0; k < _wa; ++k)
            {
                sum += (float)_A[i * _wa + k] * (float)_B[k * _wb + j];
            }
            _C[i * _wb + j] = (float)sum;
        }
    }
}
From the above, C(i, j) = sum{ A(i, k) * B(k, j) }, 0 <= k < _wa. The coupling between output elements is very weak, so we can let each thread take responsibility for its own region of the output.
How should the work be divided? The first idea is to let every thread compute one C(i, j), which requires height_c * width_c, i.e. ha * wb, threads in total. We arrange them as a grid of square blocks: with a block size of 16*16, an 80*48 result matrix splits into 5 * 3 blocks (80 = 5*16, 48 = 3*16), i.e. a grid of (height_c/16) * (width_c/16) blocks, each block containing 16*16 threads.
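The size constants used by the code below (BLOCK_SIZE, WA/HA, WB/HB, WC/HC) are never shown in the article; a plausible header consistent with the 80*48 example above might look like this (the concrete values are my assumption, not the author's):
// Hypothetical size constants (chosen to match the 80*48 example; not the original header).
// A is HA x WA, B is HB x WB, C = A * B is HC x WC; all sides are multiples of BLOCK_SIZE.
#define BLOCK_SIZE 16
#define HA 80        // height of A = height of C
#define WA 48        // width of A  = height of B (any multiple of 16 works)
#define HB WA
#define WB 48        // width of B  = width of C
#define HC HA
#define WC WB
// The grid is then (WC / BLOCK_SIZE) x (HC / BLOCK_SIZE) = 3 x 5 blocks of 16 x 16 threads.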
Okay. After division, the kernel code is as follows:
Kernel version 0:
__global__ void matrix_kernel_0(float* _C, const float* _A, const float* _B, int _wa, int _wb)
{
    float sum = 0;
    // Locate the row and column this thread is responsible for
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Thread (row, col) computes C(row, col)
    for (int i = 0; i < _wa; ++i)
    {
        sum += _A[row * _wa + i] * _B[i * _wb + col];
    }
    _C[row * _wb + col] = sum;
}
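One caveat: this kernel assumes the result dimensions are exact multiples of the block size (true for the 80*48 example). If they were not, the kernel would need a bounds check, roughly like the following sketch, which is my own variant (not in the original article) and assumes the row count _ha is passed in as an extra parameter:
// Sketch only: guarded variant for dimensions that are not multiples of BLOCK_SIZE.
__global__ void matrix_kernel_0_guarded(float* _C, const float* _A, const float* _B,
                                        int _wa, int _ha, int _wb)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= _ha || col >= _wb)   // threads outside the matrix do nothing
        return;
    float sum = 0;
    for (int i = 0; i < _wa; ++i)
        sum += _A[row * _wa + i] * _B[i * _wb + col];
    _C[row * _wb + col] = sum;
}
The launch would then round the grid dimensions up, e.g. dim3 blocks((_wb + BLOCK_SIZE - 1) / BLOCK_SIZE, (_ha + BLOCK_SIZE - 1) / BLOCK_SIZE).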
Another idea is not to let every thread compute a full C(i, j). From C(i, j) = sum{ A(i, k) * B(k, j) } we can see that the granularity can be divided further:
Csub(i, j) = sum{ A(i, ksub + offsetA) * B(ksub + offsetB, j) },  0 <= ksub < blockSize
C(i, j) = sum{ Csub(i, j) }
That is, the matrices are divided into BLOCK_SIZE * BLOCK_SIZE sub-blocks; each thread block computes the sub-products of one sub-block of A and one sub-block of B at a time, and these partial results are accumulated at the end. Shared memory is used as the optimization.
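In block-matrix notation (uppercase indices denoting BLOCK_SIZE x BLOCK_SIZE tiles), this is just ordinary block-matrix multiplication:
$$ C_{IJ} = \sum_{K} A_{IK} \, B_{KJ} $$
Kernel version 1 below has each thread block accumulate one tile C_{IJ}, processing one pair of tiles (A_{IK}, B_{KJ}) per loop iteration.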
Kernel version 1:
__global__ void matrix_kernel_1(float* _C, const float* _A, const float* _B, int _wa, int _wb)
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Range of sub-blocks of A processed by this block (block-row by of A)
    int aBegin = _wa * (by * BLOCK_SIZE);
    int aEnd = aBegin + _wa - 1;
    int aStep = BLOCK_SIZE;        // offsetA

    int bBegin = BLOCK_SIZE * bx;  // first element of block-column bx of B
    int bStep = BLOCK_SIZE * _wb;  // offsetB

    float cSub = 0;
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Each thread copies one element into shared memory
        As[ty][tx] = _A[a + _wa * ty + tx];
        Bs[ty][tx] = _B[b + _wb * ty + tx];
        __syncthreads();

        // Each thread accumulates the sub-product of the current pair of sub-blocks
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            cSub += As[ty][k] * Bs[k][tx];
        }
        __syncthreads();
    }

    // Compute the global address and write the result back to global memory:
    // one thread per element, one block per sub-block
    int cIndex = (by * BLOCK_SIZE + ty) * _wb + (bx * BLOCK_SIZE + tx);
    _C[cIndex] = cSub;
}
Finally, write a host-side interface function:
void matrixMulGPU(float* _C, const float* _A, const float* _B, int _wa, int _ha, int _wb)
{
    float* d_a = myNewOnGPU<float>(_wa * _ha);
    float* d_b = myNewOnGPU<float>(_wb * _wa);
    float* d_c = myNewOnGPU<float>(_wb * _ha);
    copyFromCPUToGPU(_A, d_a, _wa * _ha);
    copyFromCPUToGPU(_B, d_b, _wb * _wa);

    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocks(WC / BLOCK_SIZE, HC / BLOCK_SIZE);

    matrix_kernel_0<<<blocks, threads>>>(d_c, d_a, d_b, _wa, _wb);
    cudaThreadSynchronize();

    copyFromGPUToCPU(d_c, _C, _wb * _ha);

    myDeleteOnGPU(d_a);
    myDeleteOnGPU(d_b);
    myDeleteOnGPU(d_c);
}
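The helpers myNewOnGPU, copyFromCPUToGPU, copyFromGPUToCPU and myDeleteOnGPU (and the CPU-side myNewOnCPU / myDeleteOnCPU used in main below) come from the author's CUDACommon.h, which is not listed; InitCUDA, used in main, presumably enumerates the devices and calls cudaSetDevice. The helpers are most likely thin wrappers over the CUDA runtime and new/delete; a guessed reconstruction, not the original header, could look like this:
// Hypothetical reconstructions of the CUDACommon.h helpers (not the author's code).
#include <cuda_runtime.h>

template <typename T>
T* myNewOnGPU(int count)
{
    T* ptr = NULL;
    cudaMalloc((void**)&ptr, count * sizeof(T));   // allocate device memory
    return ptr;
}

template <typename T>
void myDeleteOnGPU(T* ptr) { cudaFree(ptr); }

template <typename T>
void copyFromCPUToGPU(const T* src, T* dst, int count)
{
    cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyHostToDevice);
}

template <typename T>
void copyFromGPUToCPU(const T* src, T* dst, int count)
{
    cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyDeviceToHost);
}

template <typename T>
T* myNewOnCPU(int count) { return new T[count]; }

template <typename T>
void myDeleteOnCPU(T* ptr) { delete[] ptr; }
To time version 1, the only change needed in matrixMulGPU above is launching matrix_kernel_1 instead of matrix_kernel_0; the grid and block configuration stays the same.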
The main function that calls all of this is as follows:
#include <stdio.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cutil_inline.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <windows.h>
#include "CUDACommon.h"
#include "MatrixMulCPU.h"
#include "MatrixMulGPU.h"
void randomInit(float* _data, int _size)
{
    for (int i = 0; i < _size; ++i)
    {
        _data[i] = rand() / (float)RAND_MAX;
    }
}
bool checkError(const float* _A, const float* _B, int _size)
{
    for (int i = 0; i < _size; ++i)
    {
        if (fabs(_A[i] - _B[i]) > 1.0e-3)
        {
            printf("%f\t%f\n", _A[i], _B[i]);
            return false;
        }
    }
    return true;
}
int main(int argc, char* argv[])
{
    srand(13);
    if (!InitCUDA())
    {
        return 0;
    }

    float* A = myNewOnCPU<float>(WA * HA);
    float* B = myNewOnCPU<float>(WB * HB);
    randomInit(A, WA * HA);
    randomInit(B, WB * HB);
    float* C = myNewOnCPU<float>(WC * HC);
    memset(C, 0, sizeof(float) * WC * HC);
    float* C2 = myNewOnCPU<float>(WC * HC);
    memset(C2, 0, sizeof(float) * WC * HC);

    unsigned int tick1 = GetTickCount();
    MatrixMulCPU(C2, A, B, WA, HA, WB);
    printf("CPU use time: %dms\n", GetTickCount() - tick1);

    unsigned int timer = 0;
    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutStartTimer(timer));
    {
        matrixMulGPU(C, A, B, WA, HA, WB);
    }
    cutilCheckError(cutStopTimer(timer));
    printf("GPU use time: %f(ms)\n", cutGetTimerValue(timer));
    cutilCheckError(cutDeleteTimer(timer));

    if (checkError(C, C2, WC * HC))
    {
        printf("Accept\n");
    }
    else
    {
        printf("Wrong Answer\n");
    }

    myDeleteOnCPU(A);
    myDeleteOnCPU(B);
    myDeleteOnCPU(C);
    myDeleteOnCPU(C2);
    return 0;
}
The measured results (the timing screenshots for version 0 and version 1 are not reproduced here) show that the GPU's parallel performance is far better than the CPU's, and that version 1 outperforms version 0.