Originality: The performance and precision of various normalize functions

Source: Internet
Author: User
Tags: square root

// Performance benchmarking program for various normalize functions
// by Elvic Liang
//
// Compares speed and precision of several ways to normalize a 3-component
// float vector: a sqrtf/division reference, SSE rsqrt/rcp estimates refined by
// one Newton-Raphson step, and integer bit-trick approximations.
//
// Define NORMALIZE_BENCH_NO_MAIN before this code to compile the routines
// without the benchmark driver (e.g. to embed them in a test harness).

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <xmmintrin.h>

// Minimal 3-component float vector used by every routine in this file.
struct vector
{
    float x, y, z;

    // Leaves the components uninitialised; callers in this file assign them
    // before reading.
    inline vector() {}
    inline vector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {}

    // Returns this vector with every component multiplied by rhs.
    inline vector operator*(float rhs) const
    {
        vector temp;
        temp.x = x * rhs;
        temp.y = y * rhs;
        temp.z = z * rhs;
        return temp;
    }
};

// Returns the larger of a and b (b when they compare equal or unordered).
template <typename T>
inline T max(T a, T b)
{
    return ((a > b) ? a : b);
}

// Reciprocal 1/x. Uses an exact division on MSVC; elsewhere the SSE rcp
// estimate refined by one Newton-Raphson step: r' = 2r - a*r*r.
inline float rcpf(float x)
{
#ifdef _MSC_VER
    return 1.0f / x;
#else
    const __m128 a = _mm_set_ss(x);
    const __m128 r = _mm_rcp_ss(a);
    // one more iteration
    return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), a)));
#endif
}

// Reciprocal square root 1/sqrt(x) from the SSE rsqrt estimate plus one
// Newton-Raphson step: r' = r * (1.5 - 0.5*a*r*r).
// The input is clamped to at least 1.0e-30f, so x == 0 yields a large finite
// value instead of infinity.
inline float invsqrtf(float x)
{
    const __m128 a = _mm_max_ss(_mm_set_ss(x), _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(a);
    // one more iteration
    return _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
        _mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
}

// Approximate 1/sqrt(x) via the integer bit trick (magic constant 0x5f3759df)
// followed by one Newton-Raphson step. No clamping is applied to x.
inline float fastinvsqrt(float x)
{
    const float xhalf = 0.5f * x;
    int i;
    // memcpy instead of a pointer cast: reinterpreting float storage through
    // an int* violates the aliasing rules.
    memcpy(&i, &x, sizeof(i));
    i = 0x5f3759df - (i >> 1);
    memcpy(&x, &i, sizeof(x));
    x = x * (1.5f - xhalf * x * x);
    return x;
}

// Approximate sqrt(x): the average of a direct bit-trick square-root estimate
// and x times a bit-trick inverse-square-root estimate.
inline float fastsqrt(float x)
{
    int bits;
    memcpy(&bits, &x, sizeof(bits));

    const int sqrt_bits = 0x1fbcf800 + (bits >> 1);
    const int rsqrt_bits = 0x5f3759df - (bits >> 1);

    float sqrt_estimate;
    float rsqrt_estimate;
    memcpy(&sqrt_estimate, &sqrt_bits, sizeof(sqrt_estimate));
    memcpy(&rsqrt_estimate, &rsqrt_bits, sizeof(rsqrt_estimate));

    return 0.5f * (sqrt_estimate + (x * rsqrt_estimate));
}

// Dot product of a and b.
inline float dot(const vector& a, const vector& b)
{
    return (a.x * b.x + a.y * b.y + a.z * b.z);
}

// Euclidean length of a.
inline float len(const vector& a)
{
    const float l = dot(a, a);
    return sqrtf(max(0.0f, l));
}

// Reference normalize: sqrtf plus three divisions. No clamping, so a
// zero-length input divides by zero.
inline vector normalize_ref(const vector& a)
{
    float length = sqrtf(max(0.0f, a.x * a.x + a.y * a.y + a.z * a.z));
    // using division gives higher precision than multiplying by (1/length)
    return vector(a.x / length, a.y / length, a.z / length);
}

// Normalize through invsqrtf().
inline vector normalize(const vector& a)
{
    return a * invsqrtf(dot(a, a));
}

// Same computation as normalize(), with the rsqrt estimate and its
// Newton-Raphson step written out in place.
inline vector normalize_v1(const vector& a)
{
    const __m128 pa = _mm_max_ss(_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z),
                                 _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(pa);
    // one more iteration
    const float d = _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
        _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
    return a * d;
}

// Normalize through the bit-trick fastinvsqrt() (lowest precision).
inline vector normalize_v2(const vector& a)
{
    return a * fastinvsqrt(dot(a, a));
}

// Like normalize_v1(), but the squared length is also computed with SSE.
inline vector normalize_v3(const vector& a)
{
    // TODO: use SSE 4.2 dot product intrinsic when available
    const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);   // lanes 0..3 = x, y, z, 1
    const __m128 s = _mm_mul_ps(x, x);
    // lane 0 = x*x + z*z (movehl brings lane 2 down to lane 0)
    const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    // shuffle with immediate 1 brings lane 1 (y*y) down to lane 0
    const __m128 pa = _mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)),
                                 _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(pa);
    // one more iteration
    return a * _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
        _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
}

// Writes the normalized a into r and returns the length that was used, i.e.
// len(a) clamped to at least 1.0e-30f.
inline float normalize_len(vector& r, const vector& a)
{
    const float l = len(a);
    const float d = max(l, 1.0e-30f);
    r = a * rcpf(d);
    return d;
}

// As normalize_len(), but clamps the squared length (not the length) to
// 1.0e-30f before taking sqrtf.
inline float normalize_len_v1(vector& r, const vector& a)
{
    const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    r = a * rcpf(d);
    return d;
}

// As normalize_len_v1(), with the rcp estimate and its Newton-Raphson step
// written out in place (used on every compiler, unlike rcpf()).
inline float normalize_len_v2(vector& r, const vector& a)
{
    const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    const __m128 pa = _mm_set_ss(d);
    const __m128 pr = _mm_rcp_ss(pa);
    // one more iteration
    const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr),
                                              _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    r = a * rd;
    return d;
}

// As normalize_len_v2(), but the square root is taken with _mm_sqrt_ss.
inline float normalize_len_v3(vector& r, const vector& a)
{
    const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_set_ss(1.0e-30f),
        _mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z)));
    const __m128 pr = _mm_rcp_ss(pa);
    // one more iteration
    const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr),
                                              _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    r = a * rd;
    return _mm_cvtss_f32(pa);
}

// As normalize_len_v1(), but with the approximate fastsqrt() (lowest precision).
inline float normalize_len_v4(vector& r, const vector& a)
{
    const float d = fastsqrt(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    r = a * rcpf(d);
    return d;
}

// As normalize_len_v3(), but the squared length is also computed with SSE.
inline float normalize_len_v5(vector& r, const vector& a)
{
    // TODO: use SSE 4.2 dot product intrinsic when available
    const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);   // lanes 0..3 = x, y, z, 1
    const __m128 s = _mm_mul_ps(x, x);
    const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)),
                                             _mm_set_ss(1.0e-30f)));
    const __m128 pr = _mm_rcp_ss(pa);
    // one more iteration
    r = a * _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr),
                                     _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    return _mm_cvtss_f32(pa);
}

// Small deterministic pseudo-random generator: the state is repeatedly run
// through a 32-bit integer mixing function.
//
// Where <stdlib.h> declares the POSIX function random(), that function hides
// this struct's name at namespace scope; refer to the type as `struct random`.
struct random
{
    unsigned int state;

    inline explicit random(unsigned int seed = 0x9e3779b1) { state = hash(seed); }

    // 32-bit integer mix (the constants match the widely circulated
    // Robert Jenkins 32-bit integer hash).
    inline unsigned int hash(unsigned int a)
    {
        a = (a + 0x7ed55d16) + (a << 12);
        a = (a ^ 0xc761c23c) ^ (a >> 19);
        a = (a + 0x165667b1) + (a << 5);
        a = (a + 0xd3a2646c) ^ (a << 9);
        a = (a + 0xfd7046c5) + (a << 3);
        a = (a ^ 0xb55a4f09) ^ (a >> 16);
        return a;
    }

    // Advances the state and returns a value in [0, 1) built from its low 24 bits.
    inline float next_float()
    {
        state = hash(state);
        return (state & 0xFFFFFF) * (1.0f / float(1 << 24));
    }

    // Advances the state and returns next_float() mapped to [-500, 500).
    inline float next()
    {
        return (next_float() * 1000.0f - 500.0f);
    }
};

// Processor time consumed so far, in clock() ticks, truncated to int.
int get_time()
{
    return (int)clock();
}

// Draws the next test vector: three consecutive rng.next() values as x, y, z.
static inline vector next_vector(struct random& rng)
{
    vector v;
    v.x = rng.next();
    v.y = rng.next();
    v.z = rng.next();
    return v;
}

// Adapts a "write result, return length" routine to the plain
// vector -> vector shape so both families share the drivers below.
template <float (*Fn)(vector&, const vector&)>
inline vector normalize_via_len(const vector& a)
{
    vector r;
    Fn(r, a);
    return r;
}

// Times ntest calls of Fn on the default-seeded random sequence and prints the
// checksum and elapsed ticks. rand_time (the cost of generating the inputs
// alone) is subtracted from the measurement. Fn is a template argument rather
// than a run-time pointer so the call can still be inlined.
template <vector (*Fn)(const vector&)>
static void run_benchmark(const char* label, int ntest, int rand_time)
{
    const int start_time = get_time();
    struct random rng;
    double sum = 0.0;
    for (int i = 0; i < ntest; ++i)
    {
        const vector v = next_vector(rng);
        const vector r = Fn(v);
        sum += (r.x + r.y + r.z);   // keeps the result live so the loop is not optimised away
    }
    const int done_time = get_time() - start_time - rand_time;
    printf("%s: sum = %f time = %d\n", label, sum, done_time);
}

// Returns the largest per-component absolute difference between Fn and
// normalize_ref over ntest vectors of the default-seeded random sequence.
// The checksum of all produced components is written to sum.
template <vector (*Fn)(const vector&)>
static float measure_max_error(int ntest, double& sum)
{
    float max_error = 0.0f;
    struct random rng;
    sum = 0.0;
    for (int i = 0; i < ntest; ++i)
    {
        const vector v = next_vector(rng);
        const vector r1 = normalize_ref(v);
        const vector r2 = Fn(v);
        sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
        const float error = max(fabsf(r1.x - r2.x),
                                max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
        // Accumulate the running maximum; assigning `error` directly would
        // report only the final sample.
        max_error = max(max_error, error);
    }
    return max_error;
}

// Runs measure_max_error<Fn>() and prints its result.
template <vector (*Fn)(const vector&)>
static void report_precision(const char* label, int ntest)
{
    double sum = 0.0;
    const float max_error = measure_max_error<Fn>(ntest, sum);
    printf("%s: sum = %f max. error = %.17f\n", label, sum, max_error);
}

#ifndef NORMALIZE_BENCH_NO_MAIN
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const int ntest = 100000000;
    int rand_time = 0;

    // Baseline: cost of generating the inputs alone.
    {
        const int start_time = get_time();
        struct random rng;
        double sum = 0.0;
        for (int i = 0; i < ntest; ++i)
        {
            const vector v = next_vector(rng);
            sum += (v.x + v.y + v.z);
        }
        rand_time = get_time() - start_time;
        printf("random: sum = %f time = %d\n", sum, rand_time);
    }

    printf("Testing performance...\n");
    run_benchmark<normalize_ref>("normalize_ref (reference)", ntest, rand_time);
    run_benchmark<normalize>("normalize", ntest, rand_time);
    run_benchmark<normalize_v1>("normalize_v1", ntest, rand_time);
    run_benchmark<normalize_v2>("normalize_v2 (fast)", ntest, rand_time);
    run_benchmark<normalize_v3>("normalize_v3", ntest, rand_time);
    run_benchmark<normalize_via_len<normalize_len> >("normalize_len", ntest, rand_time);
    run_benchmark<normalize_via_len<normalize_len_v1> >("normalize_len_v1", ntest, rand_time);
    run_benchmark<normalize_via_len<normalize_len_v2> >("normalize_len_v2", ntest, rand_time);
    run_benchmark<normalize_via_len<normalize_len_v3> >("normalize_len_v3", ntest, rand_time);
    run_benchmark<normalize_via_len<normalize_len_v4> >("normalize_len_v4 (fast)", ntest, rand_time);
    run_benchmark<normalize_via_len<normalize_len_v5> >("normalize_len_v5", ntest, rand_time);

    printf("Testing precision...\n");
    report_precision<normalize_v1>("normalize_v1", ntest);
    report_precision<normalize_v2>("normalize_v2 (fast)", ntest);
    report_precision<normalize_v3>("normalize_v3", ntest);
    report_precision<normalize_via_len<normalize_len_v3> >("normalize_len_v3", ntest);
    report_precision<normalize_via_len<normalize_len_v4> >("normalize_len_v4 (fast)", ntest);
    report_precision<normalize_via_len<normalize_len_v5> >("normalize_len_v5", ntest);

    return 0;
}
#endif // NORMALIZE_BENCH_NO_MAIN

Final conclusion: normalize_v3 is the most cost-effective. If you use normalize in 3D games or other highly interactive applications, consider using this fast implementation, which only requires the platform to support SSE 2.0. If your precision requirements are low, consider the normalize_v2 (fast) implementation instead, which uses the fast inverse square root algorithm from the Quake 3 game engine.

Originality: The performance and precision of various normalize functions

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.