CUDA is very handy for parallel computing, but interaction between the GPU and the CPU — such as passing parameters — is relatively cumbersome. CUDA kernel functions often end up with many parameters, sometimes 10–20 of them. If the data can be organized in advance on the CPU side, for example as a two-dimensional array, many arguments can be saved, and inside the kernel the data can be accessed through the two-dimensional array, simplifying the code structure. Of course, using two-dimensional (pointer-to-pointer) data adds an extra GPU memory access per element, which inevitably affects efficiency — but that is not the focus of today's discussion.
Here are two code examples to illustrate the use of two-dimensional arrays in CUDA (personally tested and working):
1. Examples of common two-dimensional arrays:
Input: Two-dimensional array a (8 Rows 4 columns)
Output: Two-dimensional array C (8 rows 4 columns)
Function: add 10 to each element of array A and save the result to the corresponding position in C.
This is a simple example of accessing data in a two-dimensional array through a combination of first-level (row) pointers and a second-level (pointer-to-pointer) pointer. The main steps are:
(1) Allocate CPU memory for level two pointers dataa, C, and one-level pointers, and DATAC. The second-level pointer points to the memory where the address of the first-level pointer is saved. The first-level pointers point to the memory in which the input and output data is saved.
(2) On the device side (GPU), likewise create second-level pointers d_a, d_c and first-level pointers d_dataA, d_dataC, and allocate GPU memory for them. The principle is the same, except that the memory they point to is GPU memory.
(3) The input data is saved to a two-dimensional array in the CPU via the host-side-level pointer dataa.
(4) Key step: write the addresses of the device-side first-level pointers into the CPU memory that the host-side second-level pointers point to (i.e., fill the host row-pointer arrays with device addresses).
(5) Key step: Use the cudamemcpy () function to copy the data from the host-side level two pointer (the address of the device-side pointer) to the GPU memory pointed to by the device-side level two pointer. This allows a level two pointer to be used on the device side to access the address of a primary pointer and then access the input data using a first-level pointer. The use of a[][], c[][].
(6) Use the cudamemcpy () function to copy the input data from the CPU memory space of the host-level pointer to the GPU memory in the device-side pointer, so that the input data is uploaded to the device side.
(7) In the kernel function Addkernel (), a two-dimensional array method can be used to read, operate and write data.
(8) Finally, the device-side pointer to the GPU memory of the output data is copied to the host side pointer to the CPU memory, print display.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdio>
#include <cstdlib>

#define Row 8
#define Col 4

// Kernel: C[y][x] = A[y][x] + 10.
// Both arguments are device-side level-two pointers: each a[i]/c[i] holds the
// device address of row i inside a flat Row*Col buffer.
// Launch layout: 2D grid/block; x indexes columns, y indexes rows.
__global__ void addKernel(int **c, int **a)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;  // column
    int idy = threadIdx.y + blockDim.y * blockIdx.y;  // row
    if (idx < Col && idy < Row)                       // guard: grid may overhang the data
    {
        c[idy][idx] = a[idy][idx] + 10;
    }
}

int main()
{
    // Host side: level-two pointers (arrays of row pointers) and the flat
    // buffers that actually hold the input/output elements.
    int **a = (int **)malloc(sizeof(int *) * Row);
    int **c = (int **)malloc(sizeof(int *) * Row);
    int *dataA = (int *)malloc(sizeof(int) * Row * Col);
    int *dataC = (int *)malloc(sizeof(int) * Row * Col);

    // Device side: mirror structure in GPU memory.
    int **d_a = NULL;
    int **d_c = NULL;
    int *d_dataA = NULL;
    int *d_dataC = NULL;

    cudaMalloc((void **)&d_a, sizeof(int *) * Row);
    cudaMalloc((void **)&d_c, sizeof(int *) * Row);
    cudaMalloc((void **)&d_dataA, sizeof(int) * Row * Col);
    cudaMalloc((void **)&d_dataC, sizeof(int) * Row * Col);

    // Fill the input with 1..Row*Col.
    for (int i = 0; i < Row * Col; i++)
    {
        dataA[i] = i + 1;
    }

    // Key step: the host row-pointer arrays are filled with DEVICE addresses,
    // so that once copied to d_a/d_c the kernel can index a[y][x] directly.
    // (a and d_dataA exist independently until this loop links them.)
    for (int i = 0; i < Row; i++)
    {
        a[i] = d_dataA + Col * i;
        c[i] = d_dataC + Col * i;
    }

    // Upload the row-pointer tables and the input data.
    cudaMemcpy(d_a, a, sizeof(int *) * Row, cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, sizeof(int *) * Row, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dataA, dataA, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);

    dim3 block(4, 4);
    // Ceil-division so the grid covers the whole Row x Col domain.
    dim3 grid((Col + block.x - 1) / block.x, (Row + block.y - 1) / block.y);
    addKernel<<<grid, block>>>(d_c, d_a);

    // Catch launch-configuration errors; the blocking cudaMemcpy below
    // surfaces any asynchronous execution errors.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    }

    // Download the result through the first-level data pointer.
    cudaMemcpy(dataC, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);

    for (int i = 0; i < Row * Col; i++)
    {
        if (i % Col == 0)
        {
            printf("\n");
        }
        printf("%5d", dataC[i]);
    }
    printf("\n");

    // Release device and host memory (the original example leaked all of it).
    cudaFree(d_a);
    cudaFree(d_c);
    cudaFree(d_dataA);
    cudaFree(d_dataC);
    free(a);
    free(c);
    free(dataA);
    free(dataC);
    return 0;
}
Example of a mat array in 2.OpenCV
Input: Image lena.jpg
Output: Image moon.jpg
Function: compute the weighted sum of the two images.
The principle is the same as above; the difference in the process is that the two-dimensional input now consists of the two image buffers below, which are then weighted and summed in CUDA.
The effect is as follows:
The code is here for reference.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>
#include <cstdio>
#include <cstdlib>

using namespace cv;

// Kernel: blend two same-sized grayscale images 50/50.
// pSrcImg is a device-side level-two pointer: pSrcImg[0]/pSrcImg[1] hold the
// device addresses of the two imgW*imgH uchar buffers. pDstImg is the flat
// output buffer. Launch layout: 2D grid/block; x indexes columns, y rows.
__global__ void addKernel(uchar **pSrcImg, uchar *pDstImg, int imgW, int imgH)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    int tidy = threadIdx.y + blockDim.y * blockIdx.y;
    if (tidx < imgW && tidy < imgH)       // guard: grid may overhang the image
    {
        int idx = tidy * imgW + tidx;
        uchar lenaValue = pSrcImg[0][idx];
        uchar moonValue = pSrcImg[1][idx];
        // 0.5f literals keep the arithmetic in single precision.
        pDstImg[idx] = (uchar)(0.5f * lenaValue + 0.5f * moonValue);
    }
}

int main()
{
    // Read the two input images as grayscale.
    Mat img[2];
    img[0] = imread("data/lena.jpg", 0);
    img[1] = imread("data/moon.jpg", 0);
    if (img[0].empty() || img[1].empty())
    {
        fprintf(stderr, "failed to load input images\n");
        return 1;
    }
    int imgH = img[0].rows;
    int imgW = img[0].cols;

    // Output image.
    Mat dstImg = Mat::zeros(imgH, imgW, CV_8UC1);

    // Host-side level-two pointer (will be filled with device addresses).
    uchar **pImg = (uchar **)malloc(sizeof(uchar *) * 2);

    // Device pointers.
    uchar **pDevice = NULL;     // input level-two pointer
    uchar *pDeviceData = NULL;  // input first-level pointer (both images back-to-back)
    uchar *pDstImgData = NULL;  // output buffer

    cudaError_t err;
    err = cudaMalloc(&pDstImgData, imgW * imgH * sizeof(uchar));
    err = cudaMalloc(&pDevice, sizeof(uchar *) * 2);
    err = cudaMalloc(&pDeviceData, sizeof(uchar) * imgH * imgW * 2);

    // Key step: fill the host level-two pointer with the DEVICE addresses of
    // the two image slots inside pDeviceData.
    for (int i = 0; i < 2; i++)
    {
        pImg[i] = pDeviceData + i * imgW * imgH;
    }

    // Upload the pointer table, then both images into their slots.
    err = cudaMemcpy(pDevice, pImg, sizeof(uchar *) * 2, cudaMemcpyHostToDevice);
    err = cudaMemcpy(pDeviceData, img[0].data, sizeof(uchar) * imgH * imgW,
                     cudaMemcpyHostToDevice);
    err = cudaMemcpy(pDeviceData + imgH * imgW, img[1].data,
                     sizeof(uchar) * imgH * imgW, cudaMemcpyHostToDevice);

    // Launch the 50/50 blend; ceil-division covers the whole image.
    dim3 block(8, 8);
    dim3 grid((imgW + block.x - 1) / block.x, (imgH + block.y - 1) / block.y);
    addKernel<<<grid, block>>>(pDevice, pDstImgData, imgW, imgH);
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
    // also surfaces any asynchronous kernel error.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
    }

    // Download the result and write it to disk.
    err = cudaMemcpy(dstImg.data, pDstImgData, imgW * imgH * sizeof(uchar),
                     cudaMemcpyDeviceToHost);
    imwrite("data/synthsis.jpg", dstImg);

    // Release device and host memory (the original example leaked all of it).
    cudaFree(pDevice);
    cudaFree(pDeviceData);
    cudaFree(pDstImgData);
    free(pImg);
    return 0;
}
Using OpenCV two-dimensional Mat arrays (level-two pointers) in CUDA