Recently I have been studying OpenCL programs, but I am not yet familiar with how work-items are actually executed. To build some intuition, I wrote a few small programs that, borrowing the approach I use when testing OpenMP code, print each work-item's ID together with the data it processed. I find this very helpful for understanding the execution model. Here is one such program:
Host program: main.cpp
/* Project: multiply the matrix of opencl by: Liu Rong time: 2012.11.20 */# include <iostream> # include <time. h> # include <string> # include <math. h> # include <vector> # include <CL/Cl. h> # include <fstream> using namespace STD; // kernel function STD: stringconverttostring (const char * filename) // convert the kernel source code, that is, the self-written parallel function, convert to string {size_t size; char * STR; STD: String s; STD: fstream F (filename, (STD: fstream: In | STD: fstream :: binary); If (F. is_open () {size _ T filesize; F. seekg (0, STD: fstream: End); size = filesize = (size_t) F. tellg (); F. seekg (0, STD: fstream: Beg); STR = new char [size + 1]; If (! Str) {f. close (); STD: cout <"memory allocation failed"; return NULL;} f. read (STR, filesize); F. close (); STR [size] = '\ 0'; S = STR; Delete [] STR; return s;} else {STD :: cout <"\ nfile containg the kernel code (\". cl \ ") not found. please copy the required file in the folder containg the executable. \ n "; exit (1);} return NULL;} int main () {Double Start, end, time1, time2; // query platform cl_int cierrnum; cl_platf Orm_id platform; cierrnum = clgetplatformids (1, & platform, null); If (cierrnum! = Cl_success) {cout <"failed to get the device" <Endl; return 0 ;}// obtain the device information cl_device_id device; cl_int status; cl_uint maxdims; cl_event events [3]; size_t globalthreads [1]; size_t localthreads [1]; size_t maxworkgroupsize; size_t maxworkitemsizes [3]; //////////////////////////////////////// //// // Step 7 analyzing proper workgroup size for the kernel/ /by querying device information // 7.1 Device Info cl_device_max_work_group_size // 7.2 device info cl_device_max_work_item_dimensions // 7.3 device Info packages ////////////////////////// //////////////////////////////////////// /// *** query device capabilities. 
maximum * Work Item dimensions and the maximmum * Work Item sizes */cierrnum = clgetdeviceids (platform, cl_device_type_all, 1, & device, null); status = clgetde Viceinfo (device, cl_device_max_work_group_size, sizeof (size_t), (void *) & maxworkgroupsize, null); If (status! = Cl_success) {STD: cout <"error: Getting device info. (clgetdeviceinfo) \ n "; return 0;} status = clgetdeviceinfo (device, cl_device_max_work_item_dimensions, sizeof (cl_uint), (void *) & maxdims, null); If (status! = Cl_success) {STD: cout <"error: Getting device info. (clgetdeviceinfo) \ n "; return 0;} status = clgetdeviceinfo (device, identifier, sizeof (size_t) * maxdims, (void *) maxworkitemsizes, null); If (status! = Cl_success) {STD: cout <"error: Getting device info. (clgetdeviceinfo) \ n "; return 0 ;}cout <" maxworkitemsizes "<maxworkitemsizes <Endl; cout <" maxdims "<maxdims <Endl; cout <"maxworkgroupsize" <(INT) maxworkgroupsize <Endl; // create the context cl_context_properties CPS [3] = {cl_context_platform, (cl_context_properties) platform, 0 }; cl_context CTX = clcreatecontext (CPS, 1, & device, null, null, & cierrnum); If (cier Rnum! = Cl_success) {cout <"failed to create context" <Endl; return 0;} cl_command_queue myqueue = clcreatecommandqueue (CTX, device, 0, & cierrnum); If (cierrnum! = Cl_success) {cout <"command queue failed" <Endl; return 0 ;}// declare the buffer and transmit data float * c = NULL; // output array float * B = NULL; // output array int c = 10; size_t datasize = sizeof (float) * C; // allocate memory space c = (float *) malloc (datasize); B = (float *) malloc (datasize); // initialize the input array cl_mem bufferc = clcreatebuffer (CTX, cl_mem_write_only, C * sizeof (float), null, & cierrnum); cl_mem bufferb = clcreatebuffer (CTX, cl_mem_write_only, C * sizeof (float), null, & cierrnum );/ /Run the kernel to compile const char * filename = "simplemultiply. cl "; STD: String sourcestr = converttostring (filename); const char * Source = sourcestr. 
c_str (); size_t sourcesize [] = {strlen (source)}; // read the CL file directly to the memory cl_program myprog = clcreateprogramwithsource (CTX, 1, & source, sourcesize, & cierrnum); // cl_program myprog = clcreateprogramwithsource (CTX, 1, (const char **) & programsource, null, & cierrnum); If (cierrn Um! = 0) {cout <"createprogram failed" <Endl;} cierrnum = clbuildprogram (myprog, 0, null, null); If (cierrnum! = 0) {cout <"clbuildprogram failed" <Endl;} cl_kernel mykernel = clcreatekernel (myprog, "vecadd", & cierrnum); If (cierrnum! = 0) {cout <"clcreatekernel failed" <Endl;} // run the program clsetkernelarg (mykernel, 0, sizeof (cl_mem), (void *) & bufferb ); clsetkernelarg (mykernel, 1, sizeof (cl_mem), (void *) & bufferc); size_t globalworksize [1]; globalworksize [0] = C/2; /// start = clock (); cierrnum = clenqueuendrangekernel (myqueue, mykernel, 1, null, globalworksize, null, 0, null, & events [0]); if (cierrnum! = 0) {cout <"clenqueuendrangekernel failed" <Endl;} // time synchronization status = clwaitforevents (1, & events [0]); If (status! = Cl_success) {STD: cout <"error: Waiting For kernel run to finish. \ (clwaitforevents0) \ n "; return 0 ;}cout <" O "<Endl; status = clreleaseevent (events [0]); // copy the result to the host end = clock (); time1 = end-start; cout <"Shijian" <time1 <Endl; cierrnum = clenqueuereadbuffer (myqueue, bufferc, cl_true, 0, datasize, C, 0, null, & events [1]); status = clwaitforevents (1, & events [1]); If (status! = Cl_success) {STD: cout <"error: Waiting For read buffer call to finish. \ (clwaitforevents1) N "; return 0;} status = clreleaseevent (events [1]); If (status! = Cl_success) {STD: cout <"error: Release event object. \ (clreleaseevent) \ n "; return 0;} cierrnum = clenqueuereadbuffer (myqueue, bufferb, cl_true, 0, datasize, B, 0, null, & events [2]); status = clwaitforevents (1, & events [2]); If (status! = Cl_success) {STD: cout <"error: Waiting For read buffer call to finish. 
\ (clwaitforevents1) N "; return 0;} status = clreleaseevent (events [2]); If (status! = Cl_success) {STD: cout <"error: Release event object. \ (clreleaseevent) \ n "; return 0 ;}// for (INT I = 0; I <C/2; I ++) {cout <" work-item: "<B [I] <": "; for (Int J = 0; j <2; j ++) {cout <C [I + J] <";}cout <Endl;} return 0 ;}
Kernel function: simplemultiply.cl
// Enter your kernel in this window

// Each work-item stores its own global ID in B[id] and writes the values
// 0 and 1 into its two-element slice of C (C[id*2] and C[id*2 + 1]).
// B must therefore hold at least one float per work-item and C at least two.
__kernel void vecadd(__global float* B, __global float* C)
{
    int id = get_global_id(0);
    // barrier(CLK_LOCAL_MEM_FENCE);
    B[id] = id;
    for (int i = 0; i < 2; i++) {
        C[id * 2 + i] = i;
    }
    // barrier(CLK_LOCAL_MEM_FENCE);
}
Running result:
From the above results, we can see that each work-item runs independently,