Nbody cuda:n elements, cut into n/p blocks, each block p elements.
Open n/p block, each block inside a P-thread.
Each thread calculates the force and acceleration of one element and the other n elements.
Each thread will need to calculate the n elements divided into n/p times to complete each calculation of P elements.
Calculate_forces: The total entrance, calculates the force and acceleration of an element and all elements.
Tile_calculation: threads are calculated at a time and the force of P elements.
Bodybodyinteraction: Stress and acceleration between two elements.
A tile in a program is a block of p*p elements.
The force formula is as follows: Universal gravitation formula
__device__ float3 bodybodyinteraction (float4 bi, float4 bj, float3 ai) {FLOAT3 R;//R_ij [3 FLOPS] r.x = bj.x-bi.x; R. y = BJ.Y-BI.Y; R.z = bj.z-bi.z; DISTSQR = Dot (R_ij, R_ij) + eps^2 [6 FLOPS] Float DISTSQR = r.x * r.x + r.y * r.y + r.z * r.z + EPS2; Invdistcube =1/distsqr^ (3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)] float distsixth = DISTSQR * DISTSQR * DISTSQR; float Invdistcube = 1.0f/sqrtf (distsixth); s = M_j * Invdistcube [1 FLOP] float s = bj.w * invdistcube; A_i = a_i + S * R_ij [6 FLOPS] ai.x + = r.x * s; AI.Y + = R.y * s; Ai.z + = R.z * s; return AI; }
__device__ float3 tile_calculation (float4 myposition, float3 accel) {int i; extern __shared__ float4[] shposition;//e Ach thread would process blockdim.x elements. for (i = 0; i < blockdim.x; i++) {accel = Bodybodyinteraction (Myposition, Shposition[i], Accel);} return accel; }
__global__ void calculate_forces (void *devx, void *deva) {extern __shared__ float4[] shposition; FLOAT4 *globalx = (fl OAT4 *) DevX; FLOAT4 *globala = (FLOAT4 *) DevA; FLOAT4 myposition; int I, tile; FLOAT3 acc = {0.0f, 0.0f, 0.0f}; int Gtid = blockidx.x * blockdim.x + threadidx.x; Current element to be processed myposition = Globalx[gtid]; N elements. P Elements is processed parallelly for (i = 0, tile = 0; i < N; i + = P, tile++) {//get element in the tile int idx = Tile * blockdim.x + threadidx.x; SHPOSITION[THREADIDX.X] = Globalx[idx]; Synchronize to make sure this all the elements in the same tile have been fetched into shared memory. __syncthreads (); ACC = tile_calculation (myposition, ACC); Synchronize to make sure all the threads has been finished calculation. __syncthreads (); }//Save The result in global memory for theintegration step. Float4 acc4 = {acc.x, acc.y, acc.z, 0.0f}; Globala[gtid] = acc4;//save the acceleration}