/********************************************************************
created:    2009/09/19
created:    PM
filename:   vector_matrix_multiplication.cu
file base:  vector_matrix_multiplication
file ext:   cu
author:     zhao.kaiyong (AT) gmail.com
purpose:    Vector Matrix Multiplication
copyright:  Everyone may use this code; please credit the source.
            For any use, please indicate the source:
            http://www.hpctech.com
            http://openhero.net
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
/*************************************** *********************************/
/* Init Cuda */
/*************************************** *********************************/
#if __DEVICE_EMULATION__
/* Device-emulation builds have no physical device to select; always succeed. */
bool initcuda(void) { return true; }
#else
/*
 * Select the first CUDA device whose properties can be queried and whose
 * major compute capability is at least 1, and make it the current device.
 *
 * Returns true when a device was selected, false otherwise. On failure a
 * diagnostic is written to stderr; on success a confirmation is written to
 * stdout.
 */
bool initcuda(void)
{
    int count = 0;
    int i = 0;

    /* A failed query is treated the same as "no devices present". */
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    /* Stop at the first usable device; i == count afterwards means none. */
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    /* Do not report success if the device could not actually be selected. */
    cudaError_t err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to select CUDA device %d: %s\n", i,
                cudaGetErrorString(err));
        return false;
    }

    printf("CUDA initialized.\n");
    return true;
}
#endif
/*************************************** *********************************/
/* Example */
/*************************************** *********************************/
/*
 * Vector-matrix product C = A * B.
 *
 *   A  : device pointer to a vector of wA floats.
 *   wA : length of A, which is also the number of rows of B.
 *   B  : device pointer to a row-major matrix with wA rows and wB columns.
 *   wB : number of columns of B, which is also the length of C.
 *   C  : device pointer to the output vector of wB floats.
 *
 * Launch layout: 1-D grid of 1-D blocks. blockDim.x MUST be 64 (each thread
 * fills exactly one slot of the 64-entry shared tile) and gridDim.x must be
 * at least ceil(wB / 64). Uses 64 * sizeof(float) bytes of static shared
 * memory. Each thread produces one element of C.
 */
__global__ static void vector_matrix_mult_kernel(float* A, long wA, float* B, long wB, float* C)
{
    const int kTile = 64;               /* tile width; must equal blockDim.x */
    __shared__ float subA[kTile];

    /* Column of B / element of C owned by this thread. */
    const long col = (long)blockIdx.x * kTile + threadIdx.x;
    const bool active = (col < wB);

    float subC = 0.0f;
    for (long base = 0; base < wA; base += kTile) {
        /* Stage one tile of A in shared memory; pad the tail with zeros. */
        const long k = base + threadIdx.x;
        subA[threadIdx.x] = (k < wA) ? A[k] : 0.0f;
        __syncthreads();                /* tile fully written before it is read */

        if (active) {
#pragma unroll
            for (int j = 0; j < kTile; j++) {
                const long row = base + j;
                /* Guard the tail tile so B is never read past row wA - 1. */
                if (row < wA) {
                    /* Adjacent threads read adjacent columns of the same row. */
                    subC += subA[j] * B[row * wB + col];
                }
            }
        }
        /* Every thread reaches this barrier, including inactive ones, so the
           tile is not overwritten while another thread is still reading it. */
        __syncthreads();
    }

    if (active) {
        C[col] = subC;
    }
}
// _ Global _ static void vector_matrix_mult_kernel_32t (float * a, long wa, float * B, long WB, float * C)
//{
//}
/* Define RUN_TEST to compare the GPU result against the CPU reference. */
#define RUN_TEST
/* Problem size in units of 64: A has 64*WA elements, B has 64*WB columns. */
#define WA 32
#define WB 64
/************************************************************************/
/* HelloCUDA                                                            */
/************************************************************************/
/*
 * Multiplies a random vector by a random row-major matrix on the GPU, times
 * the kernel together with the copy of the result back to the host, repeats
 * the computation on the CPU, times that as well, and (when RUN_TEST is
 * defined) compares the two results.
 *
 * Returns 0 on success and EXIT_FAILURE when CUDA initialisation or a host
 * allocation fails.
 */
int main(int argc, char* argv[])
{
    if (!initcuda()) {
        return EXIT_FAILURE;
    }

    srand(2009);                        /* fixed seed: reproducible inputs */

    long wA = 64 * WA;
    long wB = 64 * WB;
    long size_A = wA;
    long size_B = wA * wB;
    long size_C = wB;

    float* hA = (float*)malloc(sizeof(float) * size_A);
    float* hB = (float*)malloc(sizeof(float) * size_B);
    float* hC = (float*)malloc(sizeof(float) * size_C);
    float* test_hC = (float*)malloc(sizeof(float) * size_C);
    if (hA == NULL || hB == NULL || hC == NULL || test_hC == NULL) {
        fprintf(stderr, "Host memory allocation failed.\n");
        free(hA);                       /* free(NULL) is a no-op */
        free(hB);
        free(hC);
        free(test_hC);
        return EXIT_FAILURE;
    }

    /* long indices: size_B is a long and must not be compared against int. */
    for (long i = 0; i < size_A; i++) {
        hA[i] = (float)rand() / (float)RAND_MAX;
    }
    for (long i = 0; i < size_B; i++) {
        hB[i] = (float)rand() / (float)RAND_MAX;
    }

    float* dA = 0;
    float* dB = 0;
    float* dC = 0;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dA, sizeof(float) * size_A));
    CUDA_SAFE_CALL(cudaMalloc((void**)&dB, sizeof(float) * size_B));
    CUDA_SAFE_CALL(cudaMalloc((void**)&dC, sizeof(float) * size_C));
    CUDA_SAFE_CALL(cudaMemcpy(dA, hA, sizeof(float) * size_A, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(dB, hB, sizeof(float) * size_B, cudaMemcpyHostToDevice));

    unsigned int timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));

    /* The kernel requires exactly 64 threads per block; one thread per
       element of C, rounded up to whole blocks. */
    dim3 threads(64);
    dim3 blocks((unsigned int)((wB + 63) / 64));
    vector_matrix_mult_kernel<<<blocks, threads>>>(dA, wA, dB, wB, dC);
    CUT_CHECK_ERROR("Kernel execution failed\n");

    /* The blocking copy also waits for the kernel, so the timer covers it. */
    CUDA_SAFE_CALL(cudaMemcpy(hC, dC, sizeof(float) * size_C, cudaMemcpyDeviceToHost));
    CUT_SAFE_CALL(cutStopTimer(timer));
    printf("Processing Time: %f (ms)\n", cutGetTimerValue(timer));   /* GPU */

    /* Restart the timer after the reset; without this the CPU measurement
       below would not time the reference loop at all. */
    CUT_SAFE_CALL(cutResetTimer(timer));
    CUT_SAFE_CALL(cutStartTimer(timer));

    /* CPU reference: test_hC[i] = sum over j of hA[j] * hB[j][i]. */
    for (long i = 0; i < wB; i++) {
        float subC = 0.0f;
        for (long j = 0; j < wA; j++) {
            subC += hA[j] * hB[j * wB + i];
        }
        test_hC[i] = subC;
    }
    CUT_SAFE_CALL(cutStopTimer(timer));
    printf("Processing Time: %f (ms)\n", cutGetTimerValue(timer));   /* CPU */
    CUT_SAFE_CALL(cutDeleteTimer(timer));

#ifdef RUN_TEST
    CUTBoolean res = cutCompareL2fe(test_hC, hC, size_C, 1e-6f);
    printf("Test %s\n", (1 == res) ? "Passed" : "Failed");
#endif

    CUDA_SAFE_CALL(cudaFree(dA));
    CUDA_SAFE_CALL(cudaFree(dB));
    CUDA_SAFE_CALL(cudaFree(dC));
    free(hA);
    free(hB);
    free(hC);
    free(test_hC);                      /* was leaked in the original */

    CUT_EXIT(argc, argv);
    return 0;
}