// Use OpenCV to read the images, then pass the image data to CUDA for processing.
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <stdio.h>
#include <stdlib.h>

using namespace std;
using namespace cv;

// Launch configuration.
// NOTE(review): the numeric values of these two macros were missing from the
// original listing; the defaults below are placeholders -- confirm or tune.
#ifndef NUM_BLOCK
#define NUM_BLOCK 64    // number of thread blocks
#endif
#ifndef NUM_THREAD
#define NUM_THREAD 256  // number of threads per block
#endif

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

/*
 * Element-wise saturating add of two byte buffers: a[i] = min(a[i] + b[i], 255).
 *
 *   a        device buffer, read and overwritten with the result
 *   b        device buffer, read only
 *   bins     number of bytes to process in each buffer
 *   nthreads threads per block used for the launch
 *   nblocks  number of blocks used for the launch
 *
 * Each thread starts at its flat global index and advances by
 * nthreads * nblocks, so the launch must be 1-D and these two values should
 * match the actual launch configuration for every element to be visited
 * exactly once.
 */
__global__ void Hello(uchar *a, uchar *b, int bins, int nthreads, int nblocks)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat thread index across the blocks
    int stride = nthreads * nblocks;
    if (stride <= 0) {
        return;  // a non-positive stride would never terminate the loop
    }
    for (int i = idx; i < bins; i += stride) {
        // Add in int: adding directly in uchar wraps around modulo 256
        // before any clamp could be applied.
        int sum = (int)a[i] + (int)b[i];
        a[i] = (uchar)(sum > 255 ? 255 : sum);
    }
}

/*
 * Loads Test1.jpg and Test2.jpg (flag 0), adds them on the GPU with
 * saturation, writes the result into the second image's pixel buffer and
 * displays that image until a key is pressed.
 * Returns 0 on success and 1 when the inputs cannot be used.
 */
int main()
{
    IplImage *img1 = cvLoadImage("Test1.jpg", 0);
    IplImage *img2 = cvLoadImage("Test2.jpg", 0);
    if (img1 == NULL || img2 == NULL) {
        fprintf(stderr, "Failed to load Test1.jpg and/or Test2.jpg\n");
        cvReleaseImage(&img1);
        cvReleaseImage(&img2);
        return 1;
    }

    // Both buffers are copied with the byte count of the first image, so the
    // two images must have the same layout or the second copy reads out of bounds.
    if (img1->height != img2->height || img1->widthStep != img2->widthStep) {
        fprintf(stderr, "Input images must have identical dimensions\n");
        cvReleaseImage(&img1);
        cvReleaseImage(&img2);
        return 1;
    }

    uchar *a = (uchar *)img1->imageData;
    uchar *b = (uchar *)img2->imageData;
    int n = img1->height * img1->widthStep;  // byte count including row padding
    size_t bytes = (size_t)n * sizeof(uchar);

    uchar *ad = NULL;
    uchar *bd = NULL;
    CUDA_CHECK(cudaMalloc((void **)&ad, bytes));
    CUDA_CHECK(cudaMalloc((void **)&bd, bytes));
    CUDA_CHECK(cudaMemcpy(ad, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(bd, b, bytes, cudaMemcpyHostToDevice));

    dim3 dimGrid(NUM_BLOCK, 1, 1);    // grid dimensions
    dim3 dimBlock(NUM_THREAD, 1, 1);  // block dimensions
    Hello<<<dimGrid, dimBlock>>>(ad, bd, n, NUM_THREAD, NUM_BLOCK);
    CUDA_CHECK(cudaGetLastError());   // launch errors are not reported by the launch itself

    // The result is copied into img2's buffer because img2 is the image shown
    // below. This blocking copy also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(b, ad, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(ad));
    CUDA_CHECK(cudaFree(bd));

    cvNamedWindow("Image display", CV_WINDOW_AUTOSIZE);
    cvShowImage("Image display", img2);
    cvWaitKey(0);

    cvDestroyWindow("Image display");
    cvReleaseImage(&img1);
    cvReleaseImage(&img2);
    return 0;
}

// Reference code: Calculate PI
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define NBIN 1000000000  // number of bins

// Launch configuration.
// NOTE(review): the numeric values of these two macros were missing from the
// original listing; the defaults below are placeholders -- confirm or tune.
#ifndef NUM_BLOCK
#define NUM_BLOCK 64     // number of thread blocks
#endif
#ifndef NUM_THREAD
#define NUM_THREAD 256   // number of threads per block
#endif

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

int tid;
float pi = 0;

/*
 * Kernel that executes on the CUDA device.
 *
 * Every thread adds its share of the midpoint-rule terms 4 / (1 + x^2),
 * x = (i + 0.5) * step, to sum[idx], where idx is the thread's flat index.
 *
 *   sum      device array with at least one float per launched thread;
 *            each thread adds to its own slot only
 *   nbin     number of bins (terms) in total
 *   step     width of one bin
 *   nthreads threads per block used for the launch
 *   nblocks  number of blocks used for the launch
 *
 * Threads advance by nthreads * nblocks, so the launch must be 1-D and these
 * two values should match the actual launch configuration.
 */
__global__ void Cal_pi(float *sum, int nbin, float step, int nthreads, int nblocks)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat thread index across the blocks
    long long stride = (long long)nthreads * nblocks;
    if (stride <= 0) {
        return;  // a non-positive stride would never terminate the loop
    }

    // Accumulate in a local double and write global memory once. The bin
    // index can need more bits than a float mantissa holds, and a float
    // running sum over this many terms drops most of the low-order bits.
    double acc = 0.0;
    for (long long i = idx; i < nbin; i += stride) {
        double x = (i + 0.5) * step;
        acc += 4.0 / (1.0 + x * x);
    }
    sum[idx] += (float)acc;
}

/*
 * Main routine that executes on the host: launches Cal_pi, reduces the
 * per-thread partial sums, prints the estimate of pi and then the elapsed
 * wall-clock time in milliseconds.
 */
int main(void)
{
    LARGE_INTEGER frec;
    LARGE_INTEGER strt;
    LARGE_INTEGER ed;
    QueryPerformanceFrequency(&frec);
    QueryPerformanceCounter(&strt);

    dim3 dimGrid(NUM_BLOCK, 1, 1);    // grid dimensions
    dim3 dimBlock(NUM_THREAD, 1, 1);  // block dimensions
    float *sumHost = NULL;            // pointer to host array
    float *sumDev = NULL;             // pointer to device array
    float step = 1.0 / NBIN;          // step size
    size_t size = (size_t)NUM_BLOCK * NUM_THREAD * sizeof(float);  // array memory size

    // Allocate array on host
    sumHost = (float *)malloc(size);
    if (sumHost == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    // Allocate array on device and initialize it to 0
    CUDA_CHECK(cudaMalloc((void **)&sumDev, size));
    CUDA_CHECK(cudaMemset(sumDev, 0, size));

    // Do calculation on device
    Cal_pi<<<dimGrid, dimBlock>>>(sumDev, NBIN, step, NUM_THREAD, NUM_BLOCK);
    CUDA_CHECK(cudaGetLastError());   // launch errors are not reported by the launch itself

    // Retrieve result from device and store it in host array. This blocking
    // copy also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost));

    // Reduce in double so the partial sums are not rounded again on the host.
    double total = 0.0;
    for (tid = 0; tid < NUM_THREAD * NUM_BLOCK; tid++) {
        total += sumHost[tid];
    }
    pi = (float)(total * step);

    // Print results
    printf("PI =%f\n", pi);

    // Cleanup
    free(sumHost);
    CUDA_CHECK(cudaFree(sumDev));

    QueryPerformanceCounter(&ed);
    // QuadPart is a 64-bit integer; convert to double because %e requires a
    // floating-point argument.
    printf("%e\n", (double)(ed.QuadPart - strt.QuadPart) * 1000.0 / (double)frec.QuadPart);
    return 0;
}