The thread-indexing approach follows this StackOverflow post: https://stackoverflow.com/questions/26913683/different-way-to-index-threads-in-cuda-c
The cuda_gridsize function in the code is borrowed from the YOLO source code.
The code is as follows:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cmath>
#include <cstring>
#include <ctime>

using namespace std;

// Threads per block used by every launch in this file.
#define BLOCK 512

// Largest gridDim.x / gridDim.y value this helper will produce.
#define MAX_GRID_DIM 65535

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Chooses a grid that provides at least N threads when every block has BLOCK
// threads. A 1D grid is used while the block count fits in MAX_GRID_DIM;
// otherwise the blocks are folded into a roughly square 2D grid.
//
// N == 0 yields a valid 1x1x1 grid instead of underflowing in (N - 1).
// For extremely large N both dimensions are clamped to MAX_GRID_DIM; the grid
// then no longer covers N directly, so the kernel must iterate with a stride
// equal to the total thread count (as gpuCalc below does).
dim3 cuda_gridsize(size_t N) {
    if (N == 0) {
        return dim3(1, 1, 1);
    }

    const size_t k = (N - 1) / BLOCK + 1;  // number of blocks needed (ceil-div)
    size_t x = k;
    size_t y = 1;
    if (x > MAX_GRID_DIM) {
        x = (size_t)ceil(sqrt((double)k));
        // Guard against sqrt() rounding down for large k.
        while (x * x < k) {
            ++x;
        }
        if (x > MAX_GRID_DIM) {
            x = MAX_GRID_DIM;
        }
        y = (N - 1) / (x * BLOCK) + 1;
        if (y > MAX_GRID_DIM) {
            y = MAX_GRID_DIM;
        }
    }
    // printf("%zu %zu %zu %zu\n", N, x, y, x * y * BLOCK);
    return dim3((unsigned int)x, (unsigned int)y, 1);
}

// Inverts every byte of img in place (img[i] = 255 - img[i]) for the first
// H * W elements.
//
// Works with any 1D/2D grid of 1D/2D blocks: each thread starts at its
// flattened global id and advances by the total number of launched threads,
// so the array may be larger than the number of threads.
// All index arithmetic is done in size_t so that products of grid and block
// dimensions cannot wrap around in 32 bits.
__global__ void gpuCalc(unsigned char* img, long H, long W) {
    const size_t total = (size_t)H * (size_t)W;
    const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y;
    const size_t threadId2d = threadIdx.x + (size_t)threadIdx.y * blockDim.x;
    const size_t blockId2d = blockIdx.x + (size_t)blockIdx.y * gridDim.x;
    // Another way to index:
    // size_t i = ((size_t)gridDim.x * blockDim.x) * (threadIdx.y + (size_t)blockDim.y * blockIdx.y)
    //          + (threadIdx.x + (size_t)blockDim.x * blockIdx.x);
    const size_t stride = threadsPerBlock * gridDim.x * gridDim.y;  // total threads in the grid

    for (size_t i = threadId2d + threadsPerBlock * blockId2d; i < total; i += stride) {
        img[i] = 255 - img[i];
    }
}

// Copies the H * W bytes of img to device 0, inverts them with gpuCalc and
// copies the result back into img.
//
// On any CUDA failure a message is written to stderr, the device buffer is
// released and the function returns; img may then be left unmodified.
// Does nothing when img is null or either dimension is not positive.
void addWithCuda(unsigned char* img, long H, long W) {
    if (img == nullptr || H <= 0 || W <= 0) {
        return;
    }
    const size_t total = (size_t)H * (size_t)W;
    const size_t bytes = total * sizeof(unsigned char);
    unsigned char* dev_a = nullptr;

    bool ok = cudaOk(cudaSetDevice(0), "cudaSetDevice")
           && cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc")
           && cudaOk(cudaMemcpy(dev_a, img, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");
    if (ok) {
        gpuCalc<<<cuda_gridsize(total), BLOCK>>>(dev_a, H, W);
        // Launch-configuration errors are only visible through cudaGetLastError();
        // the blocking device-to-host copy below surfaces in-kernel faults.
        ok = cudaOk(cudaGetLastError(), "gpuCalc launch")
          && cudaOk(cudaMemcpy(img, dev_a, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }
    // cudaFree(nullptr) is a no-op, so this is safe on every path.
    cudaOk(cudaFree(dev_a), "cudaFree");
}

// CPU reference: inverts the first W * H bytes of img in place.
void cpuCalc(unsigned char* img, long W, long H) {
    if (img == nullptr || W <= 0 || H <= 0) {
        return;
    }
    const size_t total = (size_t)W * (size_t)H;
    for (size_t i = 0; i < total; i++) {
        img[i] = 255 - img[i];
    }
}

// Fills a W x H byte image with pseudo-random values, inverts it once on the
// CPU and once on the GPU, and checks that the two inversions cancel out
// (i.e. the image equals the saved copy of the original again).
int main() {
    const long W = 20000;
    const long H = 20000;
    const size_t total = (size_t)W * (size_t)H;

    unsigned char* img = new unsigned char[total];
    unsigned char* cmp = new unsigned char[total];
    for (size_t i = 0; i < total; i++) {
        // NOTE(review): the original modulus was lost in extraction; 256 covers
        // the full unsigned char range.
        img[i] = (unsigned char)(rand() % 256);
    }
    memcpy(cmp, img, total);

    cpuCalc(img, W, H);
    printf("CPU Calc end\n");
    addWithCuda(img, W, H);
    printf("GPU Calc end\n");

    bool flag = true;
    for (size_t i = 0; i < total; i++) {
        if (img[i] != cmp[i]) {
            printf("No pass\n");
            flag = false;
            break;
        }
    }
    if (flag) printf("Pass");

    delete[] cmp;
    delete[] img;
    getchar();
    return 0;
}
How to handle arrays in CUDA whose element count is greater than the number of threads