Reposted from: http://www.cnblogs.com/mikewolf2002/archive/2012/09/05/2671261.html
Author: Mike Old Wolf
Now we write a simple OpenCL program that computes the element-wise sum of two arrays and stores the result in a third array. The program computes the sum on the CPU and on the GPU separately, and finally verifies that the two results are equal. The flow of the OpenCL program is roughly as follows:
The main parts of the source code are shown below:
int main (int argc, char* argv[])
{
Create three buffers in host memory
float *buf1 = 0;
float *buf2 = 0;
float *buf = 0;
BUF1 = (float *) malloc (BUFSIZE * sizeof (float));
BUF2 = (float *) malloc (BUFSIZE * sizeof (float));
BUF = (float *) malloc (BUFSIZE * sizeof (float));
Initialize the contents of BUF1 and buf2 with some random values
int i;
Srand ((unsigned) time (NULL));
for (i = 0; i < BUFSIZE; i++)
Buf1[i] = rand ()%65535;
Srand ((unsigned) time (NULL) +1000);
for (i = 0; i < BUFSIZE; i++)
Buf2[i] = rand ()%65535;
CPU calculates the BUF1,BUF2 and
for (i = 0; i < BUFSIZE; i++)
Buf[i] = Buf1[i] + buf2[i];
Cl_uint status;
CL_PLATFORM_ID platform;
Creating Platform Objects
Status = Clgetplatformids (1, &platform, NULL);
Note: if more than one OpenCL platform is installed on the system (on my machine, for example, there are two: Intel's and AMD's), the line above may cause errors later on. It may return Intel's OpenCL platform, which supports only CPUs, whereas the rest of this program is GPU-based. In that case we can use the following code to select AMD's OpenCL platform instead.
cl_uint numplatforms;
Std::string Platformvendor;
Status = Clgetplatformids (0, NULL, &numplatforms);
if (Status! = cl_success) {return 0;
} if (0 < numplatforms) {cl_platform_id* platforms = new Cl_platform_id[numplatforms];
Status = Clgetplatformids (Numplatforms, platforms, NULL);
Char platformname[100];
for (unsigned i = 0; i < numplatforms; ++i) {status = Clgetplatforminfo (Platforms[i],
Cl_platform_vendor, sizeof (PlatformName), PlatformName, NULL);
platform = Platforms[i];
Platformvendor.assign (PlatformName);
if (!strcmp (PlatformName, "Advanced Micro Devices, Inc."))
{break;
}} std::cout << "Platform found:" << platformname << "\ n";
Delete[] platforms; }
CL_DEVICE_ID device;
Creating a GPU Device
Clgetdeviceids (Platform, Cl_device_type_gpu,
1,
&device,
NULL);
Create a context
Cl_context context = Clcreatecontext (NULL,
1,
&device,
NULL, NULL, NULL);
To create a command queue
Cl_command_queue Queue = Clcreatecommandqueue (context,
Device
Cl_queue_profiling_enable, NULL);
Create three OpenCL memory objects and put the contents of the BUF1 in an implicitly-copied manner
BUF1 content copied to Clbuf1,buf2 is copied to clbuf2 by means of a copy display
Cl_mem clbuf1 = Clcreatebuffer (Context,
cl_mem_read_only | Cl_mem_copy_host_ptr,
Bufsize*sizeof (cl_float), BUF1,
NULL);
Cl_mem clbuf2 = Clcreatebuffer (Context,
Cl_mem_read_only,
Bufsize*sizeof (cl_float), NULL,
NULL);
Cl_event writeevt;
Status = Clenqueuewritebuffer (Queue, CLBUF2, 1,
0, Bufsize*sizeof (cl_float), buf2, 0, 0, 0);
The line above copies the contents of buf2 to clbuf2. Because buf2 is on the host side and clbuf2 is on the device side, this function performs a host-to-device transfer, that is, a copy from system memory to video memory. I therefore call clFlush after it to submit all the commands in the command queue to the device (note: clFlush does not guarantee that the commands have finished executing). We then call waitforeventandrelease to wait for the write to complete. waitforeventandrelease is a user-defined function, shown below. It uses the event to check whether the operation has completed; until it has, the program blocks at this call. Alternatively, the built-in OpenCL function clWaitForEvents can be used in place of clFlush and waitforeventandrelease.
Wait for event completion
int waitforeventandrelease (cl_event *event)
{
Cl_int status = cl_success;
Cl_int eventstatus = cl_queued;
while (eventstatus! = cl_complete)
{
status = Clgeteventinfo (
*event,
cl_event_command_execution_ STATUS,
sizeof (cl_int),
&eventstatus,
NULL);
}
Status = Clreleaseevent (*event);
return 0;
}
Status = Clflush (queue);
Wait for data transfer to complete before proceeding to execution
Waitforeventandrelease (&WRITEEVT);
Cl_mem buffer = Clcreatebuffer (context,
Cl_mem_write_only,
BUFSIZE * sizeof (cl_float),
NULL, NULL);
The kernel file contains the code that is executed on the GPU. It is kept in a separate file, add.cl. The kernel in this program is very simple: it just adds two arrays. The kernel code is:
// Element-wise vector addition: each work-item computes one element,
// C[id] = A[id] + B[id], where id is the work-item's global index in
// dimension 0.
__kernel void Vecadd (__global const float* A, __global const float* B, __global float* C)
{
    const size_t id = get_global_id (0);
    C[id] = A[id] + B[id];
}
Kernel files are add.cl
const char * filename = "add.cl";
Std::string sourcestr;
Status = ConvertToString (filename, sourcestr);
converttostring is also a user-defined function. It reads the kernel source file into a string; its code is as follows:
// Reads a whole text file into a string; used here to load the kernel source
// file.
//
// filename: path of the file to read.
// s:        receives the complete file contents on success and is left
//           unchanged on failure.
// Returns 0 on success, 1 if the file cannot be opened or read.
int converttostring(const char *filename, std::string& s)
{
    if (filename == NULL)
    {
        printf("Error: no file name given\n");
        return 1;
    }

    // Binary mode so that the size reported by tellg matches the bytes read.
    std::ifstream f(filename, std::ios::in | std::ios::binary);
    if (!f.is_open())
    {
        printf("Error: failed to open file %s\n", filename);
        return 1;
    }

    f.seekg(0, std::ios::end);
    const std::streamoff size = f.tellg();
    f.seekg(0, std::ios::beg);
    if (size < 0 || !f)
    {
        printf("Error: failed to read file %s\n", filename);
        return 1;
    }

    // Read straight into a std::string: no manual new[]/delete[] and no
    // hand-written terminator.
    std::string contents(static_cast<size_t>(size), '\0');
    if (size > 0 && !f.read(&contents[0], static_cast<std::streamsize>(size)))
    {
        printf("Error: failed to read file %s\n", filename);
        return 1;
    }

    s.swap(contents);
    return 0;
}
const char * source = SOURCESTR.C_STR ();
size_t sourcesize[] = {strlen (source)};
To create a program object
Cl_program program = Clcreateprogramwithsource (
Context
1,
&source,
Sourcesize,
NULL);
Compiling program objects
Status = Clbuildprogram (program, 1, &device, NULL, NULL, NULL);
if (Status! = 0)
{
printf ("Clbuild failed:%d\n", status);
Char tbuf[0x10000];
Clgetprogrambuildinfo (program, device, Cl_program_build_log, 0x10000, Tbuf, NULL);
printf ("\n%s\n", tbuf);
return-1;
}
Create a Kernel object
Cl_kernel kernel = Clcreatekernel (program, "Vecadd", NULL);
Setting the kernel parameter
Cl_int clnum = BUFSIZE;
Clsetkernelarg (kernel, 0, sizeof (CL_MEM), (void*) &clbuf1);
Clsetkernelarg (kernel, 1, sizeof (CL_MEM), (void*) &clbuf2);
Clsetkernelarg (kernel, 2, sizeof (CL_MEM), (void*) &buffer);
Note: when executing the kernel we set only the global number of work-items; the work-group size is not set. In that case the system uses a default work-group size, usually 256 or similar.
Perform kernel,range with 1 dimensions, work itmes size of bufsize
Cl_event ev;
size_t global_work_size = BUFSIZE;
Clenqueuendrangekernel (Queue,
Kernel
1,
Null
&global_work_size,
NULL, 0, NULL, &EV);
Status = Clflush (queue);
Waitforeventandrelease (&ev);
Data is copied back to host memory
Cl_float *ptr;
Cl_event mapevt;
PTR = (cl_float *) Clenqueuemapbuffer (queue,
Buffer
Cl_true,
Cl_map_read,
0,
BUFSIZE * sizeof (cl_float),
0, NULL, NULL, NULL);
Status = Clflush (queue);
Waitforeventandrelease (&MAPEVT);
Result validation, comparison of results with CPU calculations
if (!memcmp (buf, PTR, BUFSIZE))
printf ("Verify passed\n");
else printf ("Verify failed");
if (BUF)
Free (BUF);
if (BUF1)
Free (BUF1);
if (BUF2)
Free (BUF2);
When the program exits, these OpenCL objects are generally released automatically. For completeness, and to cultivate a good habit, I have added code here that releases the OpenCL objects manually.
// Release the OpenCL objects created above, including the kernel object.
clReleaseMemObject(clbuf1);
clReleaseMemObject(clbuf2);
clReleaseMemObject(buffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
After the program runs, its output looks like this:
For complete code, please refer to:
Project Document GCLTUTORIAL1
Code Download:
http://files.cnblogs.com/mikewolf2002/gclTutorial.zip