Since Apple officially submitted OpenCL to the Khronos Group open standards organization, it has received support from major companies such as AMD, NVIDIA, and Intel. OpenCL can make full use of the GPU's capacity for data-intensive, large-scale computation, so many multimedia applications, and even scientific computing workloads, can gain a large performance improvement.
Here we mainly introduce how to use OpenCL with the AMD APP SDK on Windows 7.
First, go to the AMD developer website (developer.amd.com) and download the APP SDK. On Windows 7, the installer automatically adds the AMDAPPSDKROOT environment variable to the system; we will use this variable to locate the header include path and the path of the libraries to link against.
Next, we need Visual Studio 2012, either the Express edition or the Professional edition. Create a Win32 Console project or a Win32 Application project, open <Project Name> Properties from the Project menu, go to C/C++, select General, and in Additional Include Directories enter $(AMDAPPSDKROOT)\include.
Then, locate Preprocessor and add the macro _CRT_SECURE_NO_WARNINGS to Preprocessor Definitions. This macro is needed later because the code reads the kernel file with the C file functions (such as fopen), which MSVC otherwise flags as unsafe.
Next, go to Linker, click Additional Library Directories, and add $(AMDAPPSDKROOT)\lib\x86.
Then, click Input and add OpenCL.lib to Additional Dependencies.
In this way, we have completed all the preparations.
Next, we can write the opencl kernel code required in this example:
/* Element-wise integer addition: dst[i] = src1[i] + src2[i].
 * Each work-item handles the single element selected by its global ID
 * in dimension 0. */
__kernel void MyCLAdd(__global int *dst, __global int *src1, __global int *src2)
{
    const int gid = get_global_id(0);
    const int sum = src1[gid] + src2[gid];

    dst[gid] = sum;
}
We save the code above as cl_kernel.cl and place it in the project folder that holds the source and resource files. In the VC++ project, we can add this file under the Resource Files filter.
Next, we can write the main function or other functions to create and run the opencl kernel code.
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

namespace {

// Number of integers in each input vector and in the result vector.
const size_t kElementCount = 128;

// Owns every OpenCL handle the sample creates. The destructor releases the
// handles that were actually created (non-null), so every early return from
// main() cleans up without a goto that would jump over initializations.
struct ClResources
{
    cl_context context;            // OpenCL context
    cl_command_queue commandQueue; // command queue bound to the chosen device
    cl_program program;            // program object built from cl_kernel.cl
    cl_mem input1MemObj;           // device buffer for kernel argument src1
    cl_mem input2MemObj;           // device buffer for kernel argument src2
    cl_mem outputMemObj;           // device buffer for kernel argument dst
    cl_kernel kernel;              // kernel object for MyCLAdd

    ClResources()
        : context(nullptr), commandQueue(nullptr), program(nullptr),
          input1MemObj(nullptr), input2MemObj(nullptr), outputMemObj(nullptr),
          kernel(nullptr)
    {
    }

    ~ClResources()
    {
        if (kernel != nullptr)       clReleaseKernel(kernel);
        if (program != nullptr)      clReleaseProgram(program);
        if (input1MemObj != nullptr) clReleaseMemObject(input1MemObj);
        if (input2MemObj != nullptr) clReleaseMemObject(input2MemObj);
        if (outputMemObj != nullptr) clReleaseMemObject(outputMemObj);
        if (commandQueue != nullptr) clReleaseCommandQueue(commandQueue);
        if (context != nullptr)      clReleaseContext(context);
    }

private:
    // Non-copyable: a copy would release the same handles twice.
    ClResources(const ClResources &);
    ClResources &operator=(const ClResources &);
};

// Returns true when `status` is CL_SUCCESS; otherwise reports which step
// failed together with the numeric OpenCL error code and returns false.
bool succeeded(cl_int status, const char *step)
{
    if (status == CL_SUCCESS)
        return true;
    cerr << "Error: " << step << " failed (OpenCL error " << status << ")" << endl;
    return false;
}

// Fills `devices` with all devices of `type` on `platform`.
// Returns false (and leaves `devices` empty) when the query fails or no
// device of that type exists.
bool queryDevices(cl_platform_id platform, cl_device_type type, vector<cl_device_id> &devices)
{
    devices.clear();

    cl_uint count = 0;
    if (clGetDeviceIDs(platform, type, 0, nullptr, &count) != CL_SUCCESS || count == 0)
        return false;

    devices.resize(count);
    if (clGetDeviceIDs(platform, type, count, devices.data(), nullptr) != CL_SUCCESS)
    {
        devices.clear();
        return false;
    }
    return true;
}

// Reads the whole file at `path` into `source`.
// Prints a message and returns false when the file cannot be opened or read.
bool readKernelSource(const char *path, string &source)
{
    FILE *fp = fopen(path, "rb");
    if (fp == nullptr)
    {
        puts("The kernel file not found!");
        return false;
    }

    bool ok = false;
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        const long length = ftell(fp); // -1 on failure
        if (length >= 0 && fseek(fp, 0, SEEK_SET) == 0)
        {
            source.resize(static_cast<size_t>(length));
            ok = source.empty() || fread(&source[0], 1, source.size(), fp) == source.size();
        }
    }
    fclose(fp);

    if (!ok)
        puts("Failed to read the kernel file!");
    return ok;
}

// Prints the compiler output for `program` on `device`, if any is available,
// so that a kernel compile error is visible to the user.
void printBuildLog(cl_program program, cl_device_id device)
{
    size_t logSize = 0;
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize) != CL_SUCCESS
        || logSize == 0)
        return;

    vector<char> log(logSize + 1, '\0'); // extra byte guarantees termination
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr) == CL_SUCCESS)
        cerr << log.data() << endl;
}

} // namespace

// Adds two 128-element integer vectors on the first GPU device of the first
// OpenCL platform (falling back to a CPU device), using the MyCLAdd kernel
// loaded from cl_kernel.cl, then verifies the result on the host.
// Returns EXIT_SUCCESS when the result is correct and EXIT_FAILURE on any
// error or mismatch.
int main(void)
{
    /* Step 1: Get the platforms and, for clarity, choose the first one. */
    cl_uint numPlatforms = 0;
    cl_int status = clGetPlatformIDs(0, nullptr, &numPlatforms);
    if (status != CL_SUCCESS)
    {
        cout << "Error: Getting platforms!" << endl;
        return EXIT_FAILURE;
    }
    if (numPlatforms == 0)
    {
        puts("Your system does not have any OpenCL platform!");
        return EXIT_FAILURE;
    }

    vector<cl_platform_id> platforms(numPlatforms);
    status = clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
    if (status != CL_SUCCESS)
    {
        cout << "Error: Getting platforms!" << endl;
        return EXIT_FAILURE;
    }
    const cl_platform_id platform = platforms[0];

    /* Step 2: Choose the first GPU device if there is one; otherwise use a CPU device. */
    vector<cl_device_id> devices;
    if (queryDevices(platform, CL_DEVICE_TYPE_GPU, devices))
    {
        cout << "The number of devices: " << devices.size() << endl;
    }
    else
    {
        cout << "No GPU device available." << endl;
        cout << "Choose CPU as default device." << endl;
        if (!queryDevices(platform, CL_DEVICE_TYPE_CPU, devices))
        {
            cerr << "Error: No OpenCL device available!" << endl;
            return EXIT_FAILURE;
        }
    }
    const cl_device_id device = devices[0];

    ClResources cl; // released automatically on every return below

    /* Step 3: Create the context. */
    cl.context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &status);
    if (!succeeded(status, "clCreateContext"))
        return EXIT_FAILURE;

    /* Step 4: Create the command queue associated with the context. */
    cl.commandQueue = clCreateCommandQueue(cl.context, device, 0, &status);
    if (!succeeded(status, "clCreateCommandQueue"))
        return EXIT_FAILURE;

    /* Step 5: Read the kernel code and create the program object. */
    string kernelSource;
    if (!readKernelSource("cl_kernel.cl", kernelSource))
        return EXIT_FAILURE;

    const char *sourceText = kernelSource.c_str();
    const size_t sourceLength = kernelSource.size();
    cl.program = clCreateProgramWithSource(cl.context, 1, &sourceText, &sourceLength, &status);
    if (!succeeded(status, "clCreateProgramWithSource"))
        return EXIT_FAILURE;

    /* Step 6: Build the program; show the compiler output if the build fails. */
    status = clBuildProgram(cl.program, 1, &device, nullptr, nullptr, nullptr);
    if (!succeeded(status, "clBuildProgram"))
    {
        printBuildLog(cl.program, device);
        return EXIT_FAILURE;
    }

    /* Step 7: Initialize host inputs/output and create the memory objects for the kernel. */
    // 32-byte alignment, intended to improve the data copy (MSVC-specific syntax).
    int __declspec(align(32)) input1Buffer[kElementCount];
    int __declspec(align(32)) input2Buffer[kElementCount];
    int __declspec(align(32)) outputBuffer[kElementCount] = { 0 };

    for (size_t i = 0; i < kElementCount; i++)
        input1Buffer[i] = input2Buffer[i] = static_cast<int>(i + 1);

    const size_t bufferBytes = kElementCount * sizeof(int);

    cl.input1MemObj = clCreateBuffer(cl.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                     bufferBytes, input1Buffer, &status);
    if (!succeeded(status, "clCreateBuffer (input 1)"))
        return EXIT_FAILURE;

    cl.input2MemObj = clCreateBuffer(cl.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                     bufferBytes, input2Buffer, &status);
    if (!succeeded(status, "clCreateBuffer (input 2)"))
        return EXIT_FAILURE;

    cl.outputMemObj = clCreateBuffer(cl.context, CL_MEM_WRITE_ONLY, bufferBytes, nullptr, &status);
    if (!succeeded(status, "clCreateBuffer (output)"))
        return EXIT_FAILURE;

    /* Step 8: Create the kernel object. */
    cl.kernel = clCreateKernel(cl.program, "MyCLAdd", &status);
    if (!succeeded(status, "clCreateKernel"))
        return EXIT_FAILURE;

    /* Step 9: Set the kernel arguments (dst, src1, src2). */
    if (!succeeded(clSetKernelArg(cl.kernel, 0, sizeof(cl_mem), &cl.outputMemObj), "clSetKernelArg (dst)")
        || !succeeded(clSetKernelArg(cl.kernel, 1, sizeof(cl_mem), &cl.input1MemObj), "clSetKernelArg (src1)")
        || !succeeded(clSetKernelArg(cl.kernel, 2, sizeof(cl_mem), &cl.input2MemObj), "clSetKernelArg (src2)"))
        return EXIT_FAILURE;

    /* Step 10: Run the kernel with one work-item per element. */
    const size_t globalWorkSize[1] = { kElementCount };
    status = clEnqueueNDRangeKernel(cl.commandQueue, cl.kernel, 1, nullptr, globalWorkSize,
                                    nullptr, 0, nullptr, nullptr);
    if (!succeeded(status, "clEnqueueNDRangeKernel"))
        return EXIT_FAILURE;

    // Force a wait until the OpenCL kernel has completed.
    if (!succeeded(clFinish(cl.commandQueue), "clFinish"))
        return EXIT_FAILURE;

    /* Step 11: Read the output back to host memory (blocking read). */
    status = clEnqueueReadBuffer(cl.commandQueue, cl.outputMemObj, CL_TRUE, 0, bufferBytes,
                                 outputBuffer, 0, nullptr, nullptr);
    if (!succeeded(status, "clEnqueueReadBuffer"))
        return EXIT_FAILURE;

    /* Step 12: Verify that every element equals the sum of its two inputs. */
    printf("Veryfy the rsults... ");
    size_t checked = 0;
    while (checked < kElementCount && outputBuffer[checked] == static_cast<int>((checked + 1) * 2))
        ++checked;

    if (checked != kElementCount)
    {
        puts("Results not correct!");
        return EXIT_FAILURE;
    }
    puts("Correct!");
    return EXIT_SUCCESS;
}
We can compile and run it directly.
From the verification step at the end of the program, we can see that the output is completely correct.
Note that the source file must be saved with the .cpp suffix and compiled with a VC compiler that supports the C++11 standard, such as VS2012. VS2010 may also be able to compile it, although I have not tried it.