////////////////////////////////////////////////////////////////////////
// Performance Benchmarking Program for Various normalize functions
//
// by Elvic liang
////////////////////////////////////////////////////////////////////////
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <xmmintrin.h>

// Minimal 3-component float vector used by every normalize variant.
struct vector
{
    float x, y, z;

    // Leaves the components uninitialized on purpose: the benchmark loops
    // assign all three before use and should not pay for zero-filling.
    inline vector() {}
    inline vector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {}

    // Component-wise scale by a scalar.
    inline vector operator*(float rhs) const
    {
        vector temp;
        temp.x = x * rhs;
        temp.y = y * rhs;
        temp.z = z * rhs;
        return temp;
    }
};

// Returns the larger of a and b; returns b when the comparison is false
// (which includes the case where either operand is NaN).
template <typename T>
inline T max(T a, T b)
{
    return ((a > b) ? a : b);
}

// Reinterprets the object representation of a float as an int.
// memcpy is used instead of a pointer cast or union so the conversion does
// not rely on aliasing rules; assumes int and float have the same size.
inline int float_to_bits(float f)
{
    int i;
    memcpy(&i, &f, sizeof(i));
    return i;
}

// Inverse of float_to_bits.
inline float bits_to_float(int i)
{
    float f;
    memcpy(&f, &i, sizeof(f));
    return f;
}

// Reciprocal 1/x. Uses a plain division under MSVC, otherwise the SSE
// approximate reciprocal refined with one Newton-Raphson step.
inline float rcpf(float x)
{
#ifdef _MSC_VER
    return 1.0f / x;
#else
    const __m128 a = _mm_set_ss(x);
    const __m128 r = _mm_rcp_ss(a);
    // one more iteration: r' = 2r - r*r*a
    return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), a)));
#endif
}

// Reciprocal square root 1/sqrt(x) using the SSE approximation refined with
// one Newton-Raphson step. The input is clamped to at least 1.0e-30f.
inline float invsqrtf(float x)
{
    const __m128 a = _mm_max_ss(_mm_set_ss(x), _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(a);
    // one more iteration: r' = r * (1.5 - 0.5*a*r*r)
    return _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), _mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
}

// Bit-level reciprocal square root estimate (magic constant 0x5f3759df)
// followed by one Newton-Raphson step. No clamping is applied to x.
inline float fastinvsqrt(float x)
{
    const float xhalf = 0.5f * x;
    int i = float_to_bits(x);
    i = 0x5f3759df - (i >> 1);
    x = bits_to_float(i);
    x = x * (1.5f - xhalf * x * x);
    return x;
}

// Approximate square root: the average of two bit-level estimates, one of
// sqrt(x) directly and one formed as x times a reciprocal-sqrt estimate.
inline float fastsqrt(float x)
{
    const int bits = float_to_bits(x);
    const float direct = bits_to_float(0x1fbcf800 + (bits >> 1));
    const float inverse = bits_to_float(0x5f3759df - (bits >> 1));
    return 0.5f * (direct + (x * inverse));
}

// Dot product of a and b.
inline float dot(const vector& a, const vector& b)
{
    return (a.x * b.x + a.y * b.y + a.z * b.z);
}

// Euclidean length of a.
inline float len(const vector& a)
{
    const float l = dot(a, a);
    return sqrtf(max(0.0f, l));
}

// Reference normalize: sqrtf plus three divisions. There is no guard for a
// zero-length input, which therefore divides by zero.
inline vector normalize_ref(const vector& a)
{
    const float length = sqrtf(max(0.0f, a.x * a.x + a.y * a.y + a.z * a.z));
    // using division gives higher precision than multiplying (1/length)
    return vector(a.x / length, a.y / length, a.z / length);
}

// Normalize via the shared invsqrtf() helper.
inline vector normalize(const vector& a)
{
    return a * invsqrtf(dot(a, a));
}

// Normalize with the rsqrt + Newton-Raphson sequence written inline;
// the squared length is computed with scalar arithmetic.
inline vector normalize_v1(const vector& a)
{
    const __m128 pa = _mm_max_ss(_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z), _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(pa);
    // one more iteration
    const float d = _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
    return a * d;
}

// Normalize via the bit-level fastinvsqrt() approximation.
inline vector normalize_v2(const vector& a)
{
    return a * fastinvsqrt(dot(a, a));
}

// Normalize with the squared length also computed in SSE registers.
inline vector normalize_v3(const vector& a)
{
    // TODO: use the SSE4.1 dot product intrinsic (_mm_dp_ps) when available
    const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
    const __m128 s = _mm_mul_ps(x, x);
    // lane 0 becomes x*x + z*z; adding lane 1 (y*y) below completes the sum
    const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    const __m128 pa = _mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(pa);
    // one more iteration
    return a * _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
}

// Writes the normalized a into r and returns the length of a, clamped to at
// least 1.0e-30f. Built from len() and rcpf().
inline float normalize_len(vector& r, const vector& a)
{
    const float l = len(a);
    const float d = max(l, 1.0e-30f);
    r = a * rcpf(d);
    return d;
}

// As normalize_len, with the length computation written inline.
inline float normalize_len_v1(vector& r, const vector& a)
{
    const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    r = a * rcpf(d);
    return d;
}

// As normalize_len_v1, with the refined SSE reciprocal written inline.
inline float normalize_len_v2(vector& r, const vector& a)
{
    const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    const __m128 pa = _mm_set_ss(d);
    const __m128 pr = _mm_rcp_ss(pa);
    // one more iteration: pr' = 2*pr - pr*pr*pa
    const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    r = a * rd;
    return d;
}

// As normalize_len_v2, with the square root also taken in an SSE register.
inline float normalize_len_v3(vector& r, const vector& a)
{
    const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_set_ss(1.0e-30f), _mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z)));
    const __m128 pr = _mm_rcp_ss(pa);
    // one more iteration
    const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    r = a * rd;
    return _mm_cvtss_f32(pa);
}

// As normalize_len_v1, with the bit-level fastsqrt() approximation.
inline float normalize_len_v4(vector& r, const vector& a)
{
    const float d = fastsqrt(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    r = a * rcpf(d);
    return d;
}

// As normalize_len_v3, with the squared length computed in SSE registers.
inline float normalize_len_v5(vector& r, const vector& a)
{
    // TODO: use the SSE4.1 dot product intrinsic (_mm_dp_ps) when available
    const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
    const __m128 s = _mm_mul_ps(x, x);
    const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f)));
    const __m128 pr = _mm_rcp_ss(pa);
    // one more iteration
    r = a * _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    return _mm_cvtss_f32(pa);
}

// Deterministic pseudo-random source built on an integer hash, so every
// benchmark pass sees the same input sequence.
// Not named "random": that identifier clashes with POSIX random(), which
// <xmmintrin.h> can bring into scope via <stdlib.h>.
struct random_gen
{
    unsigned int state;

    inline random_gen(unsigned int seed = 0x9e3779b1) { state = hash(seed); }

    // Six-round add/xor/shift integer mixer.
    inline unsigned int hash(unsigned int a)
    {
        a = (a + 0x7ed55d16) + (a << 12);
        a = (a ^ 0xc761c23c) ^ (a >> 19);
        a = (a + 0x165667b1) + (a << 5);
        a = (a + 0xd3a2646c) ^ (a << 9);
        a = (a + 0xfd7046c5) + (a << 3);
        a = (a ^ 0xb55a4f09) ^ (a >> 16);
        return a;
    }

    // Advances the state and returns a value in [0, 1) built from its low 24 bits.
    inline float next_float()
    {
        state = hash(state);
        return (state & 0xFFFFFF) * (1.0f / float(1 << 24));
    }

    // Returns a value in [-500, 500).
    inline float next() { return (next_float() * 1000.0f - 500.0f); }
};

// Processor time in clock() ticks, truncated to int.
int get_time()
{
    return (int)clock();
}

// Returns its argument unchanged; used to time input generation alone.
inline vector passthrough(const vector& a)
{
    return a;
}

// Adapts a normalize_len-style function to the normalize signature by
// discarding the returned length.
template <float (*Fn)(vector&, const vector&)>
inline vector discard_len(const vector& a)
{
    vector r;
    Fn(r, a);
    return r;
}

// Runs Fn over ntest generated vectors, stores the elapsed clock ticks in
// elapsed, and returns the running component sum. The sum exists so the
// compiler cannot discard the work being timed.
template <vector (*Fn)(const vector&)>
static double time_pass(int ntest, int& elapsed)
{
    const int start_time = get_time();
    random_gen rng;
    double sum = 0.0;
    for (int i = 0; i < ntest; ++i)
    {
        vector v;
        v.x = rng.next();
        v.y = rng.next();
        v.z = rng.next();
        const vector r = Fn(v);
        sum += (r.x + r.y + r.z);
    }
    elapsed = get_time() - start_time;
    return sum;
}

// Compares Fn against normalize_ref over ntest generated vectors, stores
// the largest per-component absolute difference seen across ALL samples in
// max_error, and returns the running component sum.
template <vector (*Fn)(const vector&)>
static double error_pass(int ntest, float& max_error)
{
    max_error = 0.0f;
    random_gen rng;
    double sum = 0.0;
    for (int i = 0; i < ntest; ++i)
    {
        vector v;
        v.x = rng.next();
        v.y = rng.next();
        v.z = rng.next();
        const vector r1 = normalize_ref(v);
        const vector r2 = Fn(v);
        sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
        const float sample_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
        // Fold into the running maximum; assigning the per-sample value
        // directly would report only the final iteration.
        max_error = max(max_error, sample_error);
    }
    return sum;
}

// Times every normalize variant, then measures the precision of the
// approximate ones against normalize_ref. Reported times are clock() ticks
// with the cost of input generation (rand_time) subtracted.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const int ntest = 100000000;
    int ticks = 0;
    double sum = 0.0;

    sum = time_pass<passthrough>(ntest, ticks);
    const int rand_time = ticks;
    printf("random:sum =%f time =%d\n", sum, rand_time);

    printf("Testing performance...\n");

    sum = time_pass<normalize_ref>(ntest, ticks);
    printf("Normalize_ref (reference): sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<normalize>(ntest, ticks);
    printf("normalize:sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<normalize_v1>(ntest, ticks);
    printf("normalize_v1:sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<normalize_v2>(ntest, ticks);
    printf("Normalize_v2 (FAST): sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<normalize_v3>(ntest, ticks);
    printf("normalize_v3:sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<discard_len<normalize_len> >(ntest, ticks);
    printf("normalize_len:sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<discard_len<normalize_len_v1> >(ntest, ticks);
    printf("normalize_len_v1:sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<discard_len<normalize_len_v2> >(ntest, ticks);
    printf("normalize_len_v2:sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<discard_len<normalize_len_v3> >(ntest, ticks);
    printf("normalize_len_v3:sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<discard_len<normalize_len_v4> >(ntest, ticks);
    printf("Normalize_len_v4 (FAST): sum =%f time =%d\n", sum, ticks - rand_time);

    sum = time_pass<discard_len<normalize_len_v5> >(ntest, ticks);
    printf("normalize_len_v5:sum =%f time =%d\n", sum, ticks - rand_time);

    printf("Testing precision...\n");

    float max_error = 0.0f;

    sum = error_pass<normalize_v1>(ntest, max_error);
    printf("Normalize_v1:sum =%f max. Error =%.17f\n ", sum, max_error);

    sum = error_pass<normalize_v2>(ntest, max_error);
    printf("Normalize_v2 (FAST): sum =%f max. Error =%.17f\n ", sum, max_error);

    sum = error_pass<normalize_v3>(ntest, max_error);
    printf("normalize_v3:sum =%f max. Error =%.17f\n ", sum, max_error);

    sum = error_pass<discard_len<normalize_len_v3> >(ntest, max_error);
    printf("normalize_len_v3:sum =%f max. Error =%.17f\n ", sum, max_error);

    sum = error_pass<discard_len<normalize_len_v4> >(ntest, max_error);
    printf("Normalize_len_v4 (FAST): sum =%f max. Error =%.17f\n ", sum, max_error);

    sum = error_pass<discard_len<normalize_len_v5> >(ntest, max_error);
    printf("normalize_len_v5:sum =%f max. Error =%.17f\n ", sum, max_error);

    return 0;
}
Final conclusion: normalize_v3 offers the best trade-off between speed and precision. If you normalize vectors in 3D games or other highly interactive applications, consider using this fast implementation; its only requirement is that the platform supports SSE 2.0. If your precision requirements are low, consider the normalize_v2 (fast) implementation, which uses the fast inverse square root algorithm from the Quake 3 game engine.
Original article: The performance and precision of various normalize functions