Next is Multigpu and Openglinterop, but the two projects skip first. Because I have only one video card in My computer, and then I can't use it with OpenGL.
So the next step is the OpenCL scan project, and at first glance it feels very different from what it was before, and it's hard to look like.
CL File:
Scan codelets////////////////////////////////////////////////////////////////////////////////#if (1)//Naive Inclusive Scan:o (n * log2 (n)) Operations//allocate 2 * ' size ' local memory, initialize the first half//with ' Si Ze ' zeros Avoiding if (pos >= offset) condition evaluation//and Saving Instructions inline uint scan1inclusive (
UINT Idata, __local uint *l_data, uint size) {UINT POS = 2 * get_local_id (0)-(get_local_id (0) & (size-1));
L_data[pos] = 0;
pos + = size;
L_data[pos] = idata;
for (UINT offset = 1; offset < size; offset <<= 1) {barrier (clk_local_mem_fence);
UINT T = L_data[pos] + L_data[pos-offset];
Barrier (clk_local_mem_fence);
L_data[pos] = t;
} return L_data[pos]; } inline UINT scan1exclusive (UINT idata, __local uint *l_data, uint size) {return scan1inclusive (idata, L_dat
A, size)-idata; } #else #dEfine log2_warp_size 5U #define WARP_SIZE (1U << log2_warp_size)//almost the same as naive scan1inclu sive but doesn ' t need barriers//and works only for size <= warp_size inline uint warpscaninclusive (UINT idata,
Volatile __local uint *l_data, uint size) {UINT POS = 2 * get_local_id (0)-(get_local_id (0) & (size-1));
L_data[pos] = 0;
pos + = size;
L_data[pos] = idata;
if (size >= 2) l_data[pos] + = l_data[pos-1];
if (size >= 4) l_data[pos] + = l_data[pos-2];
if (size >= 8) L_data[pos] + = l_data[pos-4];
if (size >=) L_data[pos] + = l_data[pos-8];
if (size >=) L_data[pos] + = l_data[pos-16];
return L_data[pos]; } inline UINT warpscanexclusive (UINT idata, __local uint *l_data, uint size) {return warpscaninclusive (idata,
L_data, size)-idata;
} inline UINT scan1inclusive (UINT idata, __local uint *l_data, uint size) { if (Size > warp_size) {//bottom-level inclusive WARP scan UINT Warpresult = WARPSCANINCLU
Sive (Idata, L_data, warp_size); Save top elements of each warp for exclusive warp scan//sync to wait for warp scans to complete (because L_
Data is being overwritten) barrier (clk_local_mem_fence); if ((get_local_id (0) & (warp_size-1) = = (warp_size-1)) l_data[get_local_id (0) >> Log2_warp
_size] = Warpresult;
Wait for warp scans to complete barrier (clk_local_mem_fence); if (get_local_id (0) < (workgroup_size/warp_size)) {//grab top WARP elements uint Val
= l_data[get_local_id (0)]; Calculate exclsive scan and write back to shared memory l_data[get_local_id (0)] = warpscanexclusive (Val,
L_data, size >> log2_warp_size); }//return updated warp scans witH Exclusive Scan results barrier (clk_local_mem_fence);
return Warpresult + l_data[get_local_id (0) >> log2_warp_size];
}else{return warpscaninclusive (idata, l_data, size); }} inline UINT scan1exclusive (UINT idata, __local uint *l_data, uint size) {return scan1inclusive (idata,
L_data, size)-idata; } #endif//vector scan:the array to being scanned is stored//in work-item private memory as Uint4 inline uint4 scan4inclu
Sive (uint4 data4, __local uint *l_data, uint size) {//level-0 inclusive scan data4.y + = data4.x;
Data4.z + = Data4.y;
DATA4.W + = data4.z;
LEVEL-1 Exclusive scan UINT val = scan1inclusive (DATA4.W, L_data, SIZE/4)-DATA4.W;
Return (data4 + (UINT4) val); } inline Uint4 scan4exclusive (uint4 data4, __local uint *l_data, uint size) {return scan4inclusive (data4, L_data, siz
e)-data4; }
////////////////////////////////////////////////////////////////////////////Scan Kernels////////////////////////////////////////////////////////////////////////////////__kernel __ ATTRIBUTE__ ((Reqd_work_group_size (Workgroup_size, 1, 1)) void ScanExclusiveLocal1 (__global uint4 *d_dst, __globa
L UINT4 *d_src, __local uint *l_data, uint size) {//load Data uint4 idata4 = d_src[get_global_id (0)];
Calculate Exclusive scan Uint4 odata4 = scan4exclusive (idata4, l_data, size);
Write back d_dst[get_global_id (0)] = odata4; }//exclusive scan of top elements of Bottom-level scans (4 * threadblock_size) __kernel __attribute__ ((reqd_work_group_s Ize (Workgroup_size, 1, 1))) void ScanExclusiveLocal2 (__global uint *d_buf, __global uint *D_DST, __global UIn T *d_src, __local uint *l_data, uint N, uint arraylength) {//load top elements//convert results of B Ottom-level scan back to inclusive//skip loads and stores for inactive Work-items of the work-group with highest Inde X (POS >=N) UINT data = 0;
if (get_global_id (0) < N) data = d_dst[(4 * workgroup_size-1) + (4 * workgroup_size) * GET_GLOBAL_ID (0)] +
d_src[(4 * workgroup_size-1) + (4 * workgroup_size) * GET_GLOBAL_ID (0)];
Compute UINT OData = scan1exclusive (data, L_data, arraylength);
Avoid out-of-bound Access if (get_global_id (0) < N) d_buf[get_global_id (0)] = OData; }//final step of Large-array scan:combine Basic inclusive scan with exclusive scan of top elements of input arrays __ke Rnel __attribute__ ((Reqd_work_group_size (Workgroup_size, 1, 1)) void Uniformupdate (__global uint4 *d_data, __glo
Bal UINT *d_buf) {__local uint buf[1];
Uint4 data4 = d_data[get_global_id (0)];
if (get_local_id (0) = = 0) buf[0] = d_buf[get_group_id (0)];
Barrier (clk_local_mem_fence);
Data4 + = (uint4) buf[0];
D_DATA[GET_GLOBAL_ID (0)] = data4; }
Main function Section:
int main (int argc, const char **argv) {Shrqastart (argc, (char * *) argv);
Start logs Shrsetlogfilename ("OclScan.txt");
Shrlog ("%s starting...\n\n", argv[0]); CL_PLATFORM_ID Cpplatform; OpenCL platform cl_device_id Cddevice; OpenCL device Cl_context Cxgpucontext; OpenCL context Cl_command_queue Cqcommandqueue; OpenCL command que cl_mem d_input, d_output;
OpenCL Memory buffer objects Cl_int cierrnum;
UINT *h_input, *H_OUTPUTCPU, *h_outputgpu;
const UINT N = 13 * 1048576/2;
Shrlog ("Allocating and initializing host arrays...\n");
H_input = (UINT *) malloc (N * sizeof (UINT));
H_OUTPUTCPU = (UINT *) malloc (N * sizeof (UINT));
H_OUTPUTGPU = (UINT *) malloc (N * sizeof (UINT));
Srand (2009);
for (UINT i = 0; i < N; i++) H_input[i] = rand ();
Shrlog ("Initializing opencl...\n"); Get the NVIDIA platform Cierrnum = OCLGETPLATFOrmid (&cpplatform);
Oclcheckerror (Cierrnum, cl_success);
Get a GPU device cierrnum = Clgetdeviceids (Cpplatform, Cl_device_type_gpu, 1, &cddevice, NULL);
Oclcheckerror (Cierrnum, cl_success);
Create the context Cxgpucontext = Clcreatecontext (0, 1, &cddevice, NULL, NULL, &cierrnum);
Oclcheckerror (Cierrnum, cl_success);
Create a Command-queue cqcommandqueue = Clcreatecommandqueue (cxgpucontext, Cddevice, 0, &cierrnum);
Oclcheckerror (Cierrnum, cl_success);
Shrlog ("Initializing OpenCL scan...\n");
Initscan (Cxgpucontext, Cqcommandqueue, argv);
Shrlog ("Creating OpenCL memory objects...\n\n"); D_input = Clcreatebuffer (Cxgpucontext, Cl_mem_read_write |
Cl_mem_copy_host_ptr, N * sizeof (UINT), h_input, &cierrnum);
Oclcheckerror (Cierrnum, cl_success); D_output = Clcreatebuffer (Cxgpucontext, Cl_mem_read_write, N * sizeof (UINT), NULL, &cierrnum);
Oclcheckerror (Cierrnum, cl_success); int globalflag = 1;
Init pass/fail flag to pass size_t Szworkgroup;
const int icycles = 100;
Shrlog ("* * * Running GPU scan for short arrays (%d identical iterations) ... \ n", icycles); for (UINT arraylength = min_short_array_size; arraylength <= max_short_array_size; arraylength *= 2) {ShrLo
G ("Running Scan for%u elements (%u arrays) ... \ n", Arraylength, N/arraylength);
ClFinish (Cqcommandqueue);
Shrdeltat (0); for (int i = 0; i<icycles; i++) {szworkgroup = Scanexclusiveshort (CqC
Ommandqueue, D_output, D_input, N/arraylength,
Arraylength);
} clFinish (Cqcommandqueue);
Double timervalue = shrdeltat (0)/(double) icycles;
Shrlog ("Validating the results...\n"); Shrlog ("... reading back OpenCL memory\n ");
Cierrnum = Clenqueuereadbuffer (Cqcommandqueue, D_output, cl_true, 0, N * sizeof (UINT), H_OUTPUTGPU, 0, NULL, NULL);
Oclcheckerror (Cierrnum, cl_success);
Shrlog ("... scanexclusivehost () \ n");
Scanexclusivehost (H_outputcpu, H_input, N/arraylength,
Arraylength); Compare GPU results with CPU results and accumulate error for this test Shrlog ("... comparing the results\n
");
int localflag = 1;
for (UINT i = 0; i < N; i++) {if (H_outputcpu[i]! = H_outputgpu[i])
{localflag = 0;
Break }}//Log message on individual test result, then accumulate to global flag SHRL OG ("... Results %s\n\n ", (Localflag = = 1)?
"Match": "DON ' T Match!!!");
GlobalFlag = GlobalFlag && localflag; #ifdef gpu_profiling if (arraylength = = max_short_array_size) {Shrlog
("\ n"); Shrlogex (Logboth | MASTER, 0, "oclscan-short, throughput =%.4f melements/s, time =%.5f s, Size =%u Elements, numdevsused =%u, Workgroup =
%u\n ", (1.0e-6 * (double) arraylength/timervalue), Timervalue, Arraylength, 1, szworkgroup);
Shrlog ("\ n");
} #endif} shrlog ("* * * Running GPU scan for large arrays (%d identical iterations) ... \ n", icycles);
for (UINT arraylength = min_large_array_size; arraylength <= max_large_array_size; arraylength *= 2) {
Shrlog ("Running Scan for%u elements (%u arrays) ... \ n", Arraylength, N/arraylength);
ClFinish (Cqcommandqueue);
Shrdeltat (0); FoR (int i = 0; i<icycles; i++) {szworkgroup = Scanexclusivelarge (cqcom
Mandqueue, D_output, D_input, N/arraylength,
Arraylength);
} clFinish (Cqcommandqueue);
Double timervalue = shrdeltat (0)/(double) icycles;
Shrlog ("Validating the results...\n");
Shrlog ("... reading back OpenCL memory\n");
Cierrnum = Clenqueuereadbuffer (Cqcommandqueue, D_output, cl_true, 0, N * sizeof (UINT), H_OUTPUTGPU, 0, NULL, NULL);
Oclcheckerror (Cierrnum, cl_success);
Shrlog ("... scanexclusivehost () \ n");
Scanexclusivehost (H_outputcpu, H_input, N/arraylength,
Arraylength); Compare GPU results with CPU results and accumulate error for this Test Shrlog ("... comparing the results\n");
int localflag = 1;
for (UINT i = 0; i < N; i++) {if (H_outputcpu[i]! = H_outputgpu[i])
{localflag = 0;
Break }}//Log message on individual test result, then accumulate to global flag SHRL OG ("... Results%s\n\n ", (Localflag = = 1)?
"Match": "DON ' T Match!!!");
GlobalFlag = GlobalFlag && localflag; #ifdef gpu_profiling if (arraylength = = max_large_array_size) {Shrlog
("\ n"); Shrlogex (Logboth | MASTER, 0, "oclscan-large, throughput =%.4f melements/s, time =%.5f s, Size =%u Elements, numdevsused =%u, Workgroup = %u\n ", (1.0e-6 * (double) arraylength/timervalue), Timervalue, Arraylength, 1, Szworkgroup);
Shrlog ("\ n");
} #endif} shrlog ("shutting down...\n");
Release Kernels and program Closescan ();
Release other OpenCL Objects cierrnum = Clreleasememobject (d_output);
Cierrnum |= Clreleasememobject (d_input);
Cierrnum |= Clreleasecommandqueue (cqcommandqueue);
Cierrnum |= Clreleasecontext (cxgpucontext);
Oclcheckerror (Cierrnum, cl_success);
Release Host buffers free (H_OUTPUTGPU);
Free (H_OUTPUTCPU);
Free (h_input); Finish//pass or FAIL (cumulative tests in the Loop) Shrqafinishexit (argc, (const char * *) argv, globalf A lag?
qa_passed:qa_failed);
Finish shrexit (argc, argv); }
In fact, this is the meaning, SRC is a n=6815744 size array, also create a large array, each value in the array is the same as all previous position values: That's what it means.
However, there is a problem, the host argument h_input variable is n size UINT type when creating buffer is also n size UINT variable name is d_input to pass this d_input to kernel function scanExclusiveLocal1, and kernel that positional parameter is the global uint4* D_SRC This example can be passed directly. UINT directly to the uint4, not even the strong conversion of ... I thought it was just passing in to uint4.x. The other uint4.y, Uint4.z, and UINT4.W are random, random, meaningless values. The result is not this .... There is a buffer of 100 uint elements (because int/uint, as in the A-card implementation, and the normal int/uint size, so do not distinguish cl_ prefixes such as cl_int) you can use the UINT *p1 point to it, then nature still has 100 valid elements , i.e. p1[0]-p1[99]. This is also possible if you want to use Uint4 *p2 to point to it. But there are only 25 valid elements, namely p2[0]-p2[24]. Invalid only from p2[25]-p2[99]. Instead of my imagination p2[0]-p2[99], only the X element is valid, the y/z/w element is invalid ... The great God pointed out that this is my nth time to fell on this memory issue, and last time and last time there were 2 examples of this.
I'm now following the main function step-by-step analysis: arraylength=4, localsize=256,globalsize=n/4=1703936, executes the 1th kernel, GlobalSize a thread to call the Scan4exclusive () function, because the uint is passed to uint4, so there are actually n numbers in total. and call the Scan4inclusive () function, in fact, each globalid is responsible for the original 4 uint, function scan4inclusive () will add these 4 numbers, in fact, for these 4 numbers to achieve each number is the sum of the preceding number. Then the function scan4inclusive () calls the Scan1inclusive () function, and I parse into the scan1inclusive () function and get stuck. Scan1inclusive () function is used to localid, because Localsize is 256 size, and here l_data[pos]=0; pos+=size; (arraylength=4 size=1) L_data[pos]=idata, so in fact, the equivalent of each localid 2 positions of the number, 256 localid*2=512, so L_ The data size of 512 is no problem. It's just that the L_data Ridge is placed 0, and the number of bits placed is the sum of the first 4 bits of the original data uint, the sum of 5-8 bits, and the sum of 9-12 bits 、、、、 I fainted 、、、