Now let's write a simple OpenCL program that computes the element-wise sum of two arrays and stores the result in a third array. The program performs the computation on both the CPU and the GPU, and finally verifies that the two results are equal. The overall flow of the OpenCL program is roughly as follows:
The main code in the source file is as follows:
Int main (INT argc, char * argv [])
{
// Create three buffers in the host memory
Float * buf1 = 0;
Float * buf2 = 0;
Float * Buf = 0;
Buf1 = (float *) malloc (bufsize * sizeof (float ));
Buf2 = (float *) malloc (bufsize * sizeof (float ));
Buf = (float *) malloc (bufsize * sizeof (float ));
// Initialize buf1 and buf2 with some random values
Int I;
Srand (unsigned) Time (null ));
For (I = 0; I <bufsize; I ++)
Buf1 [I] = rand () %65535;
Srand (unsigned) Time (null) + 1000 );
For (I = 0; I <bufsize; I ++)
Buf2 [I] = rand () %65535;
// Compute the sum of buf1, buf2, and CPU.
For (I = 0; I <bufsize; I ++)
Buf [I] = buf1 [I] + buf2 [I];
Cl_uint status;
Cl_platform_id platform;
// Create a platform object
Status = clgetplatformids (1, & platform, null );
NOTE: If more than one OpenCL platform is installed on the system (for example, my machine has two: Intel and AMD), the code above may cause errors. It may return Intel's OpenCL platform, which only supports the CPU, while our subsequent operations target the GPU. In that case, we can use the following code to select AMD's OpenCL platform instead.
// Enumerate all installed OpenCL platforms and pick the one whose vendor
// string is "Advanced Micro Devices, Inc.". If no such platform exists,
// `platform` is left set to the last platform enumerated.
cl_uint numPlatforms;
std::string platformVendor;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (status != CL_SUCCESS)
{
    return 0;
}
if (0 < numPlatforms)
{
    cl_platform_id* platforms = new cl_platform_id[numPlatforms];
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    if (status != CL_SUCCESS)
    {
        // The array contents are not valid; do not iterate over them.
        delete[] platforms;
        return 0;
    }
    // Zero-initialized so it is always a valid C string, even if a vendor
    // query below fails.
    char platformName[100] = "";
    for (unsigned i = 0; i < numPlatforms; ++i)
    {
        status = clGetPlatformInfo(platforms[i],
                                   CL_PLATFORM_VENDOR,
                                   sizeof(platformName),
                                   platformName,
                                   NULL);
        if (status != CL_SUCCESS)
        {
            // Do not compare or print stale/undefined buffer contents.
            platformName[0] = '\0';
        }
        platform = platforms[i];
        platformVendor.assign(platformName);
        if (!strcmp(platformName, "Advanced Micro Devices, Inc."))
        {
            break;
        }
    }
    std::cout << "Platform found : " << platformName << "\n";
    delete[] platforms;
}
Cl_device_id device;
// Create a GPU Device
Clgetdeviceids (platform, cl_device_type_gpu,
1,
& Device,
Null );
// Create Context
Cl_context context = clcreatecontext (null,
1,
& Device,
Null, null, null );
// Create a command queue
Cl_command_queue queue = clcreatecommandqueue (context,
Device,
Cl_queue_profiling_enable, null );
// Create three opencl memory objects and copy buf1 content by implicit copying
// The buf1 content is copied to clbuf1, And the buf2 content is copied to clbuf2 through the display copy method.
Cl_mem clbuf1 = clcreatebuffer (context,
Cl_mem_read_only | cl_mem_copy_host_ptr,
Bufsize * sizeof (cl_float), buf1,
Null );
Cl_mem clbuf2 = clcreatebuffer (context,
Cl_mem_read_only,
Bufsize * sizeof (cl_float), null,
Null );
Cl_event writeevt;
Status = clenqueuewritebuffer (queue, clbuf2, 1,
0, bufsize * sizeof (cl_float), buf2, 0, 0, 0 );
The code above copies the contents of buf2 to clbuf2. Because buf2 resides on the host and clbuf2 resides on the device, this call performs a host-to-device transfer, i.e. a copy from system memory to video memory. I therefore call clFlush after it to submit all the commands in the command queue to the device (note: clFlush does not guarantee that the commands have finished executing). We then call the waitforeventandrelease function to wait for the buffer write to complete. waitforeventandrelease is a user-defined function, shown below; it uses the event to check whether the operation has completed, and the program blocks at that line until it has. Alternatively, OpenCL's built-in clWaitForEvents function can be used in place of both clFlush and waitforeventandrelease.
// Wait until the event is completed
Int waitforeventandrelease (cl_event * event)
{
Cl_int status = cl_success;
Cl_int eventstatus = cl_queued;
While (eventstatus! = Cl_complete)
{
Status = clgeteventinfo (
* Event,
Cl_event_command_execution_status,
Sizeof (cl_int ),
& Eventstatus,
Null );
}
Status = clreleaseevent (* event );
Return 0;
}
Status = clflush (Queue );
// Wait until the data transmission is complete before proceeding
Waitforeventandrelease (& writeevt );
Cl_mem buffer = clcreatebuffer (context,
Cl_mem_write_only,
Bufsize * sizeof (cl_float ),
Null, null );
The kernel code that runs on the GPU is placed in a separate file, add.cl. In this program the kernel is very simple: it just adds the two arrays. The kernel code is as follows:
// vecadd: element-wise vector addition.
// Each work-item takes its global ID in dimension 0 as an index and stores
// A[index] + B[index] into C[index].
__kernel void vecadd(__global const float* A, __global const float* B, __global float* C)
{
    const int gid = get_global_id(0);
    C[gid] = A[gid] + B[gid];
}
// The Kernel File is add. cl.
Const char * filename = "Add. Cl ";
STD: String sourcestr;
Status = converttostring (filename, sourcestr );
converttostring is also a user-defined function; it reads the kernel source file into a string. Its code is as follows:
// Read the text file into a string; used to load the kernel source file.
//
// filename: path of the file to read (opened in binary mode, so the bytes
//           are stored exactly as they appear on disk).
// s:        receives the complete file contents on success; left untouched
//           on failure.
//
// Returns 0 on success, 1 if the file cannot be opened or fully read.
int converttostring(const char* filename, std::string& s)
{
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
    if (!f.is_open())
    {
        printf("error: failed to open file %s\n", filename);
        return 1;
    }

    // Determine the file size by seeking to the end, then rewind.
    f.seekg(0, std::fstream::end);
    const std::streamoff filesize = f.tellg();
    f.seekg(0, std::fstream::beg);
    if (filesize < 0)
    {
        printf("error: failed to open file %s\n", filename);
        return 1;
    }

    // Read straight into a std::string: no manual new[]/delete[] to leak,
    // and an allocation failure surfaces as std::bad_alloc rather than being
    // reported as success.
    std::string contents(static_cast<size_t>(filesize), '\0');
    f.read(&contents[0], static_cast<std::streamsize>(filesize));
    if (!f)
    {
        printf("error: failed to open file %s\n", filename);
        return 1;
    }
    f.close();

    s.swap(contents);
    return 0;
}
Const char * Source = sourcestr. c_str ();
Size_t sourcesize [] = {strlen (source )};
// Create a program object
Cl_program program = clcreateprogramwithsource (
Context,
1,
& Source,
Sourcesize,
Null );
// Compile the program object
Status = clbuildprogram (Program, 1, & device, null );
If (status! = 0)
{
Printf ("clbuild failed: % d \ n", status );
Chartbuf [0x10000];
Clgetprogrambuildinfo (Program, device, cl_program_build_log, 0x10000, tbuf, null );
Printf ("\ n % s \ n", tbuf );
Return-1;
}
// Create a kernel object
Cl_kernel kernel = clcreatekernel (Program, "vecadd", null );
// Set the Kernel Parameter
Cl_int clnum = bufsize;
Clsetkernelarg (kernel, 0, sizeof (cl_mem), (void *) & clbuf1 );
Clsetkernelarg (kernel, 1, sizeof (cl_mem), (void *) & clbuf2 );
Clsetkernelarg (kernel, 2, sizeof (cl_mem), (void *) & buffer );
Note: When launching the kernel, we specify only the global number of work-items, not the work-group size. In this case the system uses a default work-group size, which may be 256.
// Run the kernel command. The range value is 1 dimension, and the work itmes size is bufsize.
Cl_event EV;
Size_t global_work_size = bufsize;
Clenqueuendrangekernel (queue,
Kernel,
1,
Null,
& Global_work_size,
Null, 0, null, & eV );
Status = clflush (Queue );
Waitforeventandrelease (& eV );
// Copy data back to host memory
Cl_float * PTR;
Cl_event mapevt;
PTR = (cl_float *) clenqueuemapbuffer (queue,
Buffer,
Cl_true,
Cl_map_read,
0,
Bufsize * sizeof (cl_float ),
0, null );
Status = clflush (Queue );
Waitforeventandrelease (& mapevt );
// Verify the result and compare it with the CPU computing result
If (! Memcmp (BUF, PTR, bufsize ))
Printf ("verify passed \ n ");
Else printf ("verify failed ");
If (BUF)
Free (BUF );
If (buf1)
Free (buf1 );
If (buf2)
Free (buf2 );
When the program exits, these OpenCL objects are generally released automatically. However, for the sake of completeness, and to develop a good habit, I add code here to release the OpenCL objects manually.
// Delete the opencl resource object
Clreleasememobject (clbuf1 );
Clreleasememobject (clbuf2 );
Clreleasememobject (buffer );
Clreleaseprogram (Program );
Clreleasecommandqueue (Queue );
Clreleasecontext (context );
Return 0;
}
After the program runs, its output looks like this:
The complete code can be found in:
Project file gclTutorial1
Download the code:
http://files.cnblogs.com/mikewolf2002/gclTutorial.zip