Let's take a look at the GPU sample that ships with OpenCV for its background modeling (background subtraction) algorithms:
#include <iostream> #include <string> #include "opencv2/core.hpp" #include "opencv2/core/utility.hpp" # Include "Opencv2/cudabgsegm.hpp" #include "opencv2/cudalegacy.hpp" #include "opencv2/video.hpp" #include "opencv2/
HIGHGUI.HPP "using namespace std;
using namespace CV;
using namespace Cv::cuda;
Enum Method {MOG, MOG2, GMG, fgd_stat}; int main (int argc, const char** argv) {cv::commandlineparser cmd (argc, argv, "{C camera | | Use camera} "" {F file |.. /data/768x576.avi | Input video File} "" {M method | Mog | Method (Mog, MOG2, GMG, FGD)} "" "{H Help | |
Print help Message} ");
if (Cmd.has ("Help") | | |!cmd.check ()) {cmd.printmessage ();
Cmd.printerrors ();
return 0;
} bool Usecamera = Cmd.has ("Camera");
string file = cmd.get<string> ("file");
String method = Cmd.get<string> ("method"); if (Method! = "MOG" &Amp;& Method! = "Mog2" && Method! = "GMG" && Method! = "FGD") {Cerr <&L T
"Incorrect method" << Endl;
return-1; } Method m = method = = "Mog"? Mog:method = = "Mog2"? Mog2:method = = "FGD"?
FGD_STAT:GMG;
Videocapture cap;
if (Usecamera) cap.open (0);
else Cap.open (file);
if (!cap.isopened ()) {Cerr << "can not open camera or video file" << Endl;
return-1;
} Mat frame;
Cap >> frame;
Gpumat D_frame (frame);
ptr<backgroundsubtractor> mog = Cuda::createbackgroundsubtractormog ();
ptr<backgroundsubtractor> mog2 = cuda::createbackgroundsubtractormog2 ();
ptr<backgroundsubtractor> GMG = CUDA::CREATEBACKGROUNDSUBTRACTORGMG (40);
ptr<backgroundsubtractor> FGD = CUDA::CREATEBACKGROUNDSUBTRACTORFGD ();
Gpumat D_fgmask; GpUmat d_fgimg;
Gpumat d_bgimg;
Mat Fgmask;
Mat fgimg;
Mat bgimg;
Switch (m) {case mog:mog->apply (d_frame, D_fgmask, 0.01);
Break
Case Mog2:mog2->apply (D_frame, D_fgmask);
Break
Case Gmg:gmg->apply (D_frame, D_fgmask);
Break
Case Fgd_stat:fgd->apply (D_frame, D_fgmask);
Break
} namedwindow ("image", window_normal);
Namedwindow ("Foreground mask", window_normal);
Namedwindow ("Foreground image", window_normal);
if (m! = GMG) {Namedwindow ("Mean background image", window_normal);
} for (;;)
{Cap >> frame;
if (Frame.empty ()) break;
D_frame.upload (frame);
Int64 start = Cv::gettickcount ();
Update the model switch (m) {case mog:mog->apply (d_frame, D_fgmask, 0.01);
Mog->getbackgroundimage (D_BGIMG);Break
Case Mog2:mog2->apply (D_frame, D_fgmask);
Mog2->getbackgroundimage (D_BGIMG);
Break
Case Gmg:gmg->apply (D_frame, D_fgmask);
Break
Case Fgd_stat:fgd->apply (D_frame, D_fgmask);
Fgd->getbackgroundimage (D_BGIMG);
Break
} Double fps = Cv::gettickfrequency ()/(Cv::gettickcount ()-start);
Std::cout << "fps:" << fps << Std::endl;
D_fgimg.create (D_frame.size (), D_frame.type ());
D_fgimg.setto (Scalar::all (0));
D_frame.copyto (d_fgimg, D_fgmask);
D_fgmask.download (Fgmask);
D_fgimg.download (FGIMG);
if (!d_bgimg.empty ()) d_bgimg.download (bgimg);
Imshow ("image", frame);
Imshow ("Foreground mask", fgmask);
Imshow ("Foreground image", fgimg);
if (!bgimg.empty ()) imshow ("Mean background image", bgimg); int key = Waitkey (30);
if (key = =) break;
} return 0;
}
OpenCV provides some basic wrappers for CUDA programming, such as copying images between the CPU and the GPU (Mat to GpuMat) via upload and download. OpenCV encapsulates and hides the underlying CUDA functions, which has both advantages and disadvantages. For people who are only interested in applying the algorithms it is very convenient: a few lines of code are enough to use the GPU-parallel version of an algorithm. But for those of us who study the details of how algorithms are parallelized, it is not very convenient. Moreover, the more convenient the upper-level wrapper is, the harder it becomes to separate it from the lower layer. So here I assume that the image data has already been obtained as uchar or uchar3. Also, the transfer from Mat to GpuMat is slow ... The data types wrapped by OpenCV are described in another blog post.
In contrast, I prefer the following approach:
// Swaps the first and third channel of every pixel (e.g. BGR <-> RGB).
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; threads that fall
// outside the width x height image do nothing.
// Memory layout: src and dst are tightly packed, row-major arrays of
// width * height uchar3 elements (no row padding).
__global__ void Swap_rb_kernel(const uchar3* src, uchar3* dst, int width, int height)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    // The row index must come from the y components; using threadIdx.x here
    // would leave most pixels of the image unprocessed.
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < width && y < height)
    {
        // Widen before multiplying so very large images cannot overflow int.
        const size_t idx = static_cast<size_t>(y) * width + x;

        // Read the whole pixel first so the swap is also correct when
        // src and dst point to the same buffer.
        const uchar3 v = src[idx];
        dst[idx].x = v.z;
        dst[idx].y = v.y;
        dst[idx].z = v.x;
    }
}

// Launches Swap_rb_kernel over a width x height image and waits for it to
// finish. src and dst are device pointers.
// CUDA_SAFE_CALL is not defined in this snippet; it is assumed to be provided
// elsewhere as an error-checking wrapper around a cudaError_t expression.
void Swap_rb_caller(const uchar3* src, uchar3* dst, int width, int height)
{
    // Nothing to do, and a grid dimension of 0 is not a valid launch config.
    if (width <= 0 || height <= 0)
        return;

    dim3 block(32, 8);
    // Ceil-division so partially filled tiles at the right/bottom are covered.
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);

    Swap_rb_kernel<<<grid, block, 0>>>(src, dst, width, height);

    // A launch does not return a status: fetch launch-configuration errors
    // explicitly, then synchronize to surface errors raised during execution.
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}

// Loads "lena.jpg", swaps its first and third channels on the GPU and shows
// the image before and after. Returns -1 if the image cannot be loaded.
int main()
{
    cv::Mat image = cv::imread("lena.jpg");
    if (image.empty())
        return -1; // file missing or unreadable

    // The raw copies below treat the pixel data as one packed block.
    if (!image.isContinuous())
        image = image.clone();

    cv::imshow("src", image);

    // assumes the loaded image has 3 bytes per pixel (imread's default) — TODO confirm
    const size_t memsize = static_cast<size_t>(image.cols) * image.rows * sizeof(uchar3);

    uchar3* d_src = NULL;
    uchar3* d_dst = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_src, memsize));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_dst, memsize));

    CUDA_SAFE_CALL(cudaMemcpy(d_src, image.data, memsize, cudaMemcpyHostToDevice));
    Swap_rb_caller(d_src, d_dst, image.cols, image.rows);
    // Overwrite the host image with the swapped result.
    CUDA_SAFE_CALL(cudaMemcpy(image.data, d_dst, memsize, cudaMemcpyDeviceToHost));

    cv::imshow("GPU", image);
    cv::waitKey(0);

    CUDA_SAFE_CALL(cudaFree(d_src));
    CUDA_SAFE_CALL(cudaFree(d_dst));
    return 0;
}
Therefore, we will pay more attention to how the algorithm's __device__ functions and kernel functions are implemented.
Reference: http://www.cnblogs.com/dwdxdy/p/3528711.html