GPU Sort


A single-threaded version of bitonic sort can be found at http://blog.csdn.net/sunmenggmail/article/details/42869235

Or refer to the usual diagram of a bitonic sorting network. [Figure not reproduced here.]
The idea of bitonic sort on CUDA is as follows:

Assign one thread to each element, or use 1024 threads if there are more than 1024 elements, because __syncthreads() can only synchronize threads within a single block and a block holds at most 1024 threads. When the number of elements exceeds 1024, each thread may therefore be responsible for comparing more than one element.
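A minimal sketch of that looping pattern (illustrative names only; the full kernel in the listing below additionally uses a scratch buffer to pad the array beyond len):

    // Illustrative sketch: one block of blockDim.x threads sorts a power-of-two-sized
    // array in place. Each thread strides by blockDim.x, so more than 1024 elements can
    // be handled, and __syncthreads() is legal because everything runs in a single block.
    __global__ void bitonic_sketch(unsigned *data, unsigned len2)   // len2 must be a power of 2
    {
        for (unsigned k = 2; k <= len2; k <<= 1) {          // build bitonic runs of length k
            for (unsigned j = k >> 1; j > 0; j >>= 1) {     // compare-exchange with stride j
                for (unsigned i = threadIdx.x; i < len2; i += blockDim.x) {
                    unsigned partner = i ^ j;
                    if (partner > i) {
                        bool ascending = ((i & k) == 0);    // direction of this sub-sequence
                        unsigned a = data[i], b = data[partner];
                        if (ascending ? (a > b) : (a < b)) {
                            data[i] = b;
                            data[partner] = a;
                        }
                    }
                }
                __syncthreads();   // all threads finish this pass before the next one starts
            }
        }
    }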


In the illustration above, each rectangle represents one round of comparisons carried out by many threads in parallel, so the network in the figure needs 6 rounds of comparisons to produce the correct output (for 8 elements there are log2(8) = 3 stages, with 1 + 2 + 3 = 6 compare-exchange passes in total).


#include <vector>
#include <algorithm>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
using namespace std;

#define CHECK_EQ1(a, b) do { \
    if ((a) != (b)) { \
        cout << __FILE__ << ":" << __LINE__ << ": CHECK failed because " << a << " != " << b << endl; \
        cout << cudaGetErrorString(a) << endl; \
        exit(1); \
    } \
} while (0)

#define CUDA_CHECK(condition) do { \
    cudaError_t error = condition; \
    CHECK_EQ1(error, cudaSuccess); \
} while (0)

// Return the index of the highest set bit in a word; for example, 00000110 returns 2.
static __device__ __forceinline__ unsigned int __btflo(unsigned int word)
{
    unsigned int ret;
    asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
    return ret;
}

// For more than 1024 elements: one block, each thread strides over several elements.
__global__ void bigbinoticsort(unsigned int *arr, int len, unsigned int *buf)
{
    unsigned len2 = 1 << (__btflo(len - 1u) + 1);   // round len up to the next power of 2
    unsigned int MAX = 0xffffffffu;
    unsigned id = threadIdx.x;
    if (id >= len2) return;
    unsigned iter = blockDim.x;

    // Pad the virtual tail [len, len2) with MAX so it sorts to the end.
    for (unsigned i = id; i < len2; i += iter) {
        if (i >= len) buf[i - len] = MAX;
    }
    __syncthreads();

    for (unsigned k = 2; k <= len2; k *= 2) {           // build bitonic runs of length k
        for (unsigned j = k >> 1; j > 0; j >>= 1) {     // compare-exchange with stride j
            for (unsigned i = id; i < len2; i += iter) {
                unsigned swapidx = i ^ j;
                if (swapidx > i) {
                    unsigned myelem, other;
                    if (i < len) myelem = arr[i];
                    else myelem = buf[i - len];
                    if (swapidx < len) other = arr[swapidx];
                    else other = buf[swapidx - len];

                    bool swap = false;
                    if ((i & k) == 0 && myelem > other) swap = true;
                    if ((i & k) == k && myelem < other) swap = true;
                    if (swap) {
                        if (swapidx < len) arr[swapidx] = myelem;
                        else buf[swapidx - len] = myelem;
                        if (i < len) arr[i] = other;
                        else buf[i - len] = other;
                    }
                }
            }
            __syncthreads();
        }
    }
}

// For at most 1024 elements: a single block working in shared memory.
__global__ void binoticsort(unsigned int *arr, int len)
{
    __shared__ unsigned int buf[1024];
    buf[threadIdx.x] = (threadIdx.x < len ? arr[threadIdx.x] : 0xffffffffu);
    __syncthreads();

    for (unsigned k = 2; k <= blockDim.x; k *= 2) {     // build bitonic runs of length k
        for (unsigned j = k >> 1; j > 0; j >>= 1) {     // merge longer bitonic runs into shorter ones
            unsigned swapidx = threadIdx.x ^ j;
            unsigned myelem = buf[threadIdx.x];
            unsigned other = buf[swapidx];
            __syncthreads();

            // If I am the front element of the pair: ascend = 0, descend = k.
            // If I am the back element of the pair:  ascend = k, descend = 0.
            unsigned ascend = k * (swapidx < threadIdx.x);
            unsigned descend = k * (swapidx > threadIdx.x);
            bool swap = false;
            if ((threadIdx.x & k) == ascend && myelem > other) swap = true;
            if ((threadIdx.x & k) == descend && myelem < other) swap = true;
            if (swap) buf[swapidx] = myelem;
            __syncthreads();
        }
    }
    if (threadIdx.x < len) arr[threadIdx.x] = buf[threadIdx.x];
}

template<class T>
inline void printVec(T *vec, int len)
{
    for (int i = 0; i < len; ++i) cout << vec[i] << "\t";
    cout << endl;
}

template<class T>
inline void printVecg(T *gvec, int len)
{
    T *vec = (T*)malloc(sizeof(T) * len);
    CUDA_CHECK(cudaMemcpy(vec, gvec, sizeof(T) * len, cudaMemcpyDeviceToHost));
    printVec(vec, len);
    free(vec);
}

void linesize(int n, int &nblocks, int &nthreads)
{
    if (n <= 1024) nthreads = (n + 32 - 1) / 32 * 32;   // round up to a multiple of the warp size
    else nblocks = (n + 1024 - 1) / 1024;
}

bool validate(unsigned *gvec, int len)
{
    unsigned *vec = (unsigned*)malloc(sizeof(unsigned) * len);
    CUDA_CHECK(cudaMemcpy(vec, gvec, sizeof(unsigned) * len, cudaMemcpyDeviceToHost));
    for (int i = 1; i < len; ++i) {
        if (vec[i] <= vec[i - 1]) return false;
    }
    return true;
}

inline int roundUpPower2(int v)
{
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}

int main(int argc, char *argv[])
{
    if (argc != 2) {
        cout << "usage: " << argv[0] << " len" << endl;
        return 1;
    }
    int len = atoi(argv[1]);
    unsigned int *arr = (unsigned int*)malloc(sizeof(unsigned int) * len);
    for (int i = 0; i < len; ++i) arr[i] = i;
    srand((unsigned int)time(NULL));
    for (int i = len; i >= 2; --i) {            // Fisher-Yates shuffle
        int j = rand() % i;
        swap(arr[i - 1], arr[j]);
    }

    unsigned *debug;                            // scratch buffer from the original post (unused)
    CUDA_CHECK(cudaMalloc((void**)&debug, sizeof(unsigned) * 1000));
    unsigned int *darr, *buf;
    CUDA_CHECK(cudaMalloc((void**)&darr, sizeof(unsigned int) * len));
    CUDA_CHECK(cudaMalloc((void**)&buf, sizeof(unsigned int) * len));
    CUDA_CHECK(cudaMemcpy(darr, arr, sizeof(unsigned int) * len, cudaMemcpyHostToDevice));

    bigbinoticsort<<<1, 1024>>>(darr, len, buf);
    CUDA_CHECK(cudaPeekAtLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    if (validate(darr, len)) cout << "yes\n";
    else cout << "no\n";
    return 0;
}


The code above contains two bitonic sort implementations. The one for at most 1024 elements keeps the data in shared memory to speed up access; still, if all you really need is to sort about a thousand elements or fewer, a CPU sort is the better choice, since the GPU offers no clear speed advantage at that size and can even be slower than the CPU.
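For reference, a plausible way to launch the small kernel from the host, assuming darr already holds len <= 1024 unsigned integers on the device (this mirrors how the quicksort code further down invokes it, with the thread count rounded up to a power of two):

    int nthreads = roundUpPower2(len);        // blockDim.x must be a power of two for the XOR pairing
    binoticsort<<<1, nthreads>>>(darr, len);  // one block, shared-memory bitonic sort
    CUDA_CHECK(cudaDeviceSynchronize());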


For more than 1024 elements the other kernel is used. Its obvious drawback is that, no matter how many elements there are, it runs in a single block, and a block can use at most 1024 threads. My estimate is that for inputs up to roughly 10,000 elements this is still the fastest approach on the GPU.


In my tests, which also included thrust's sort (a radix sort), sorting on the GPU only shows a clear advantage once the input exceeds roughly 5,000 elements. At around 100,000 elements, the GPU sort is about 100 times faster than the CPU.


The rest of this post describes quicksort on the GPU. GPU quicksort can handle very large inputs, but it has a recursion depth limit; once that depth is exceeded, the bitonic sort described above can be invoked instead. In my tests it is faster than thrust.


The quicksort mainly follows the NVIDIA_CUDA-6.5_Samples/6_Advanced/cdpAdvancedQuicksort sample.

The quicksort only handles ranges larger than 1024 elements. It partitions a range into a left part and a right part; if a child range is still longer than 1024 elements, quicksort is launched on it recursively through dynamic parallelism, and if it is 1024 elements or fewer, the bitonic sort is launched dynamically instead. If the quicksort recursion has already reached its maximum depth (CUDA allows a nesting depth of at most 64, and the practical limit is further constrained by the memory reserved for each level), the bitonic sort is called directly, as sketched below.
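Distilled from the qsort_warp kernel in the listing further down, the per-child dispatch decision looks roughly like this (a sketch only; identifiers such as BITONICSORT_LEN, QSORT_MAXDEPTH and the kernels are defined in that listing, and child_off/child_len are illustrative names for one half of the partitioned range):

    // Illustrative helper (not part of the original listing): decide how to sort one
    // child range [child_off, child_off + child_len) produced by a partition step.
    __device__ void dispatch_child(unsigned *outdata, unsigned *indata,
                                   unsigned child_off, unsigned child_len,
                                   qsortAtomicData *newAtomic, qsortRingbuf *ringbuf,
                                   unsigned depth, cudaStream_t stream) {
        if (child_len > BITONICSORT_LEN) {                 // still too large for one block
            if (depth >= QSORT_MAXDEPTH) {
                // Recursion budget exhausted: fall back to the multi-pass bitonic kernel,
                // using the other buffer as its scratch space.
                bigbinoticsort<<<1, BITONICSORT_LEN, 0, stream>>>(outdata + child_off, child_len, indata + child_off);
            } else {
                // Keep partitioning via dynamic parallelism: one warp-based quicksort
                // block per QSORT_BLOCKSIZE elements.
                unsigned blocks = (child_len + QSORT_BLOCKSIZE - 1) >> QSORT_BLOCKSIZE_SHIFT;
                qsort_warp<<<blocks, QSORT_BLOCKSIZE, 0, stream>>>(outdata, indata, child_off, child_len,
                                                                   newAtomic, ringbuf, true, depth + 1);
            }
        } else if (child_len > 1) {
            // Small enough for a single block: shared-memory bitonic sort.
            unsigned nthreads = 1 << (__btflo(child_len - 1u) + 1);   // round up to a power of two
            binoticsort<<<1, nthreads, 0, stream>>>(outdata + child_off, child_len);
        }
    }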


The most interesting part of this program is the partition function.

The array is partitioned in units of warps, each warp handling 32 elements. A global atomicAdd reserves this warp's output offsets for the elements less than and greater than the pivot; note that within one warp the reserved offset is shared (only one lane performs the atomicAdd and broadcasts the result). Each thread then writes its own element relative to that offset, which completes the partition.
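A stripped-down sketch of that warp-level partition step (illustrative only; lt_counter/gt_counter are made-up names, and the real version in qsort_warp below adds the pivot corner cases and the recursion bookkeeping; __ballot/__shfl are used as in the CUDA 6.5-era listing, newer CUDA uses the _sync variants):

    // Each warp partitions its 32 elements of in[]: elements <= pivot go to the front
    // region of out[], elements > pivot go to the back region, with slot ranges
    // reserved per warp via atomicAdd.
    __global__ void warp_partition_sketch(const unsigned *in, unsigned *out, unsigned len,
                                          unsigned pivot,
                                          unsigned *lt_counter,    // running count of <= pivot
                                          unsigned *gt_counter) {  // running count of  > pivot
        unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid >= len) return;
        unsigned lane = threadIdx.x & 31;

        unsigned data = in[tid];
        int greater = (data > pivot);
        unsigned gt_mask = __ballot(greater);      // one bit per lane whose element is > pivot
        unsigned lt_mask = __ballot(!greater);

        unsigned lt_base = 0, gt_base = 0;
        if (lane == 0) {
            // Lane 0 reserves a contiguous slot range for the whole warp in each region.
            lt_base = atomicAdd(lt_counter, __popc(lt_mask));
            gt_base = len - (atomicAdd(gt_counter, __popc(gt_mask)) + __popc(gt_mask));
        }
        lt_base = __shfl((int)lt_base, 0);         // broadcast the warp's base offsets from lane 0
        gt_base = __shfl((int)gt_base, 0);

        // My slot = warp base + number of lower-numbered lanes writing to the same region.
        unsigned lane_mask_lt = (1u << lane) - 1;
        unsigned my_mask = greater ? gt_mask : lt_mask;
        unsigned my_off = (greater ? gt_base : lt_base) + __popc(my_mask & lane_mask_lt);
        out[my_off] = data;
    }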


Note that this quicksort is not in-place, and it involves recursive calls, so you also have to handle swapping the roles of the original array and the buffer between recursion levels.


Since CUDA has no explicit lock, this method uses a special circular queue (the ring buffer below) to allocate its bookkeeping structures; I suspect it could still go wrong in extreme cases.

(The code below is not fully correct: it never swaps the roles of the original array and the buffer, and is only meant to aid understanding. Please refer to the CUDA sample for the correct code.)

#define QSORT_BLOCKSIZE_SHIFT 9
#define QSORT_BLOCKSIZE (1 << QSORT_BLOCKSIZE_SHIFT)
#define BITONICSORT_LEN 1024              // must be a power of 2!
#define QSORT_MAXDEPTH 16                 // force the final bitonic stage at depth QSORT_MAXDEPTH+1 (value as in the CUDA sample)
#define QSORT_STACK_ELEMS (1*1024*1024)   // one million stack elements is a HUGE number.

typedef struct __align__(128) qsortAtomicData_t
{
    volatile unsigned int lt_offset;      // current output offset for <pivot
    volatile unsigned int gt_offset;      // current output offset for >pivot
    volatile unsigned int sorted_count;   // total count sorted, for deciding when to launch the next wave
    volatile unsigned int index;          // ringbuf tracking index; can be ignored if not using the ringbuf
} qsortAtomicData;

////////////////////////////////////////////////////////////////////////////////
// A ring-buffer for rapid stack allocation
////////////////////////////////////////////////////////////////////////////////
typedef struct qsortRingbuf_t
{
    volatile unsigned int head;   // 1   head pointer - we allocate from here
    volatile unsigned int tail;   // 0   tail pointer - indicates the last still-in-use element
    volatile unsigned int count;  // 0   total count allocated
    volatile unsigned int max;    // 0   max index allocated
    unsigned int stacksize;       //     wrap-around size of the buffer (must be a power of 2)
    volatile void *stackbase;     //     pointer to the stack we're allocating from
} qsortRingbuf;

/*
   CUDA has no lock, so we do it like this:
     on alloc, ++head; on free, ++tail,
   so [tail, head) contains the allocated chunks and head points to the next free chunk.
   count records how many chunks have been freed.
   There are n chunks, but the index of a chunk keeps increasing on every re-allocation.
   max records the maximum index among the freed chunks; only when all chunks before
   max are free, i.e. max == count, can we advance tail.
*/
template <class T>
static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data)
{
    unsigned index = data->index;
    unsigned count = atomicAdd((unsigned *)&(ringbuf->count), 1) + 1;
    unsigned max = atomicMax((unsigned *)&(ringbuf->max), index + 1);
    if (max < (index + 1)) max = index + 1;
    if (max == count)
        atomicMax((unsigned *)&(ringbuf->tail), count);
}

template <class T>
static __device__ T *ringbufAlloc(qsortRingbuf *ringbuf)
{
    // Spin (bounded) until there is room between head and tail.
    unsigned int loop = 10000;
    while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && (loop-- > 0))
        ;
    if (loop == 0) return NULL;

    unsigned index = atomicAdd((unsigned *)&ringbuf->head, 1);
    T *ret = (T *)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1));
    ret->index = index;
    return ret;
}

__global__ void qsort_warp(unsigned *indata, unsigned *outdata,
                           unsigned int offset,             // 0
                           unsigned int len,
                           qsortAtomicData *atomicData,     // stack
                           qsortRingbuf *atomicDataStack,   // ring buffer
                           unsigned int source_is_indata,   // true
                           unsigned int depth)
{
    // Find my data offset, based on warp ID.
    unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT);
    unsigned int warp_id = threadIdx.x >> 5;               // used for debug only
    unsigned int lane_id = threadIdx.x & (warpSize - 1);   // % 32

    // Exit if I'm outside the range of the sort to be done.
    if (thread_id >= len) return;

    // The algorithm: each warp counts how many of its elements are greater/less than
    // the pivot, then updates an atomic counter to reserve output slots.
    // Read in the data and the pivot. Arbitrary pivot selection for now.
    unsigned pivot = indata[offset + len / 2];
    unsigned data = indata[offset + thread_id];

    // Count how many are <= and how many are > pivot. If all are <= pivot, adjust the
    // comparison, otherwise the sort would move nothing and we'd iterate forever.
    unsigned int greater = (data > pivot);
    unsigned int gt_mask = __ballot(greater);   // nth bit set iff the predicate is non-zero for the nth active lane
    if (gt_mask == 0) {
        greater = (data >= pivot);
        gt_mask = __ballot(greater);
    }
    unsigned lt_mask = __ballot(!greater);
    unsigned gt_count = __popc(gt_mask);        // number of set bits in the warp mask
    unsigned lt_count = __popc(lt_mask);

    // Only lane 0 of the warp computes the warp's two output base positions.
    unsigned lt_oft, gt_oft;
    if (lane_id == 0) {
        if (lt_count > 0)
            lt_oft = atomicAdd((unsigned *)&atomicData->lt_offset, lt_count);   // atomicAdd returns the old value, not the new one
        if (gt_count > 0)
            gt_oft = len - (atomicAdd((unsigned *)&atomicData->gt_offset, gt_count) + gt_count);

        // Debug prints from the original post (disabled):
        // printf("depth = %d\npivot = %u\n", depth, pivot);
        // printf("lt_count %u lt_oft %u gt_count %u gt_oft %u gt_offset %u\n",
        //        lt_count, lt_oft, gt_count, gt_oft, atomicData->gt_offset);
    }
    lt_oft = __shfl((int)lt_oft, 0);
    gt_oft = __shfl((int)gt_oft, 0);            // everyone pulls the offsets from lane 0
    __syncthreads();

    // Now compute my own personal offset within this range: how many threads with a
    // lane ID less than mine will write to the same region as me?
    // popc gives a single-operation warp scan here.
    unsigned lane_mask_lt;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));   // bits set below this thread's lane number
    unsigned my_mask = greater ? gt_mask : lt_mask;
    unsigned my_oft = __popc(my_mask & lane_mask_lt);

    // Move the data.
    my_oft += greater ? gt_oft : lt_oft;
    outdata[offset + my_oft] = data;
    __syncthreads();

    if (lane_id == 0) {
        /* debug (disabled):
        if (blockIdx.x == 0) {
            printf("depth = %d\n", depth);
            for (int i = 0; i < len; ++i) printf("%u ", outdata[offset + i]);
            printf("\n");
        } */
        unsigned mycount = lt_count + gt_count;
        // The last warp to finish this range launches the child sorts.
        if (atomicAdd((unsigned *)&atomicData->sorted_count, mycount) + mycount == len) {
            unsigned lt_len = atomicData->lt_offset;
            unsigned gt_len = atomicData->gt_offset;

            cudaStream_t lstream, rstream;
            cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking);
            cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking);

            ringbufFree<qsortAtomicData>(atomicDataStack, atomicData);

            if (lt_len == 0) return;

            // Left side.
            if (lt_len > BITONICSORT_LEN) {
                if (depth >= QSORT_MAXDEPTH) {
                    bigbinoticsort<<<1, BITONICSORT_LEN, 0, lstream>>>(outdata + offset, lt_len, indata + offset);
                } else if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL) {
                    printf("Stack allocation error. Failing left child launch.\n");
                } else {
                    atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
                    unsigned int numblocks = (unsigned int)(lt_len + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
                    qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, lstream>>>(outdata, indata, offset, lt_len,
                                                                           atomicData, atomicDataStack, true, depth + 1);
                }
            } else if (lt_len > 1) {
                unsigned int bitonic_len = 1 << (__btflo(lt_len - 1u) + 1);
                binoticsort<<<1, bitonic_len, 0, lstream>>>(outdata + offset, lt_len);
            }

            // Right side.
            if (gt_len > BITONICSORT_LEN) {
                if (depth >= QSORT_MAXDEPTH) {
                    bigbinoticsort<<<1, BITONICSORT_LEN, 0, rstream>>>(outdata + offset + lt_len, gt_len, indata + offset + lt_len);
                } else if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL) {
                    printf("Stack allocation error! Failing right-side launch.\n");
                } else {
                    atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
                    unsigned int numblocks = (unsigned int)(gt_len + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
                    qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, rstream>>>(outdata, indata, offset + lt_len, gt_len,
                                                                           atomicData, atomicDataStack, true, depth + 1);
                }
            } else if (gt_len > 1) {
                unsigned int bitonic_len = 1 << (__btflo(gt_len - 1u) + 1);
                binoticsort<<<1, bitonic_len, 0, rstream>>>(outdata + offset + lt_len, gt_len);
            }
        }
    }
}

void runqsort(unsigned *gpudata, unsigned *scratchdata, unsigned int count, cudaStream_t stream)
{
    unsigned int stacksize = QSORT_STACK_ELEMS;   // 1*1024*1024

    // This is the stack, for atomic tracking of each sort's status.
    qsortAtomicData *gpustack;
    CUDA_CHECK(cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData)));
    CUDA_CHECK(cudaMemset(gpustack, 0, sizeof(qsortAtomicData)));   // only the first entry needs to be zeroed

    // Create the memory ringbuffer used for handling the stack and initialise it.
    qsortRingbuf buf;
    qsortRingbuf *ringbuf;
    CUDA_CHECK(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf)));
    buf.head = 1;            // we start with one allocation
    buf.tail = 0;
    buf.count = 0;
    buf.max = 0;
    buf.stacksize = stacksize;
    buf.stackbase = gpustack;
    CUDA_CHECK(cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice));

    if (count > BITONICSORT_LEN) {   // 1024
        // QSORT_BLOCKSIZE = 2^9 = 512
        unsigned int numblocks = (unsigned int)(count + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
        qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, stream>>>(gpudata, scratchdata, 0U, count,
                                                              gpustack, ringbuf, true, 0);
    } else {
        binoticsort<<<1, BITONICSORT_LEN>>>(gpudata, count);
        CUDA_CHECK(cudaMemcpy(scratchdata, gpudata, sizeof(unsigned) * count, cudaMemcpyDeviceToDevice));
    }
    cudaDeviceSynchronize();
}
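For completeness, a minimal host-side driver for the listing above might look like the following (a sketch only, reusing the includes, CUDA_CHECK macro and kernels from the listings above; error handling is trimmed, and because of the dynamic parallelism the program has to be compiled with relocatable device code for compute capability 3.5 or higher, e.g. nvcc -arch=sm_35 -rdc=true):

    int main()
    {
        const unsigned count = 1 << 20;                   // one million elements
        std::vector<unsigned> host(count);
        for (unsigned i = 0; i < count; ++i) host[i] = rand();

        unsigned *gpudata, *scratchdata;
        CUDA_CHECK(cudaMalloc((void **)&gpudata, count * sizeof(unsigned)));
        CUDA_CHECK(cudaMalloc((void **)&scratchdata, count * sizeof(unsigned)));
        CUDA_CHECK(cudaMemcpy(gpudata, host.data(), count * sizeof(unsigned), cudaMemcpyHostToDevice));

        runqsort(gpudata, scratchdata, count, 0);         // 0 = default stream

        // Remember the caveat above: this simplified listing does not swap the two
        // buffers between recursion levels, so unlike the original sample the final
        // result is not guaranteed to end up entirely in one of them.
        cudaFree(gpudata);
        cudaFree(scratchdata);
        return 0;
    }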


