Original: The performance and precision of various normalize functions

Last Update: 2015-07-12 Source: Internet

Author: User

Tags: square root

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more ＞

//////////////////////////////////////////////////
// Performance benchmarking program for various normalize functions
// by Elvic Liang
//////////////////////////////////////////////////
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <xmmintrin.h>

// Minimal 3-component float vector used by every normalize variant.
struct Vector
{
    float x, y, z;

    // Leaves the components uninitialized; callers assign them before use.
    inline Vector() {}
    inline Vector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {}

    // Component-wise scale by a scalar.
    inline Vector operator*(float rhs) const
    {
        Vector temp;
        temp.x = x * rhs;
        temp.y = y * rhs;
        temp.z = z * rhs;
        return temp;
    }
};

// Returns the larger of a and b (returns b when they compare equal).
template <typename T>
inline T max(T a, T b)
{
    return ((a > b) ? a : b);
}

// Reinterprets the bit pattern of a float as an int.
// memcpy is used instead of pointer/union casts, which are undefined
// behaviour in C++ (strict aliasing / inactive union member).
inline int float_to_bits(float f)
{
    int bits;
    memcpy(&bits, &f, sizeof(bits));
    return bits;
}

// Reinterprets the bit pattern of an int as a float (inverse of float_to_bits).
inline float bits_to_float(int bits)
{
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

// Reciprocal 1/x.
// On MSVC this is a plain division; elsewhere it is the SSE reciprocal
// estimate refined by one Newton-Raphson step: r' = 2r - r*r*x.
inline float rcpf(float x)
{
#ifdef _MSC_VER
    return 1.0f / x;
#else
    const __m128 a = _mm_set_ss(x);
    const __m128 r = _mm_rcp_ss(a);
    // One more iteration
    return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), a)));
#endif
}

// Reciprocal square root 1/sqrt(x) using the SSE estimate plus one
// Newton-Raphson step: r' = r * (1.5 - 0.5 * x * r * r).
// The input is clamped to at least 1.0e-30f before the estimate is taken.
inline float invsqrtf(float x)
{
    const __m128 a = _mm_max_ss(_mm_set_ss(x), _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(a);
    // One more iteration
    return _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
        _mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
}

// Bit-trick reciprocal square root (magic constant 0x5f3759df) followed by
// one Newton-Raphson step. Lower precision than invsqrtf.
inline float fastinvsqrt(float x)
{
    const float xhalf = 0.5f * x;
    int i = float_to_bits(x);
    i = 0x5f3759df - (i >> 1);
    x = bits_to_float(i);
    x = x * (1.5f - xhalf * x * x);
    return x;
}

// Bit-trick square root: averages an integer-shift sqrt estimate with
// x * (integer-shift inverse-sqrt estimate).
inline float fastsqrt(float x)
{
    const int bits = float_to_bits(x);
    const float sqrt_estimate = bits_to_float(0x1fbcf800 + (bits >> 1));
    const float invsqrt_estimate = bits_to_float(0x5f3759df - (bits >> 1));
    return 0.5f * (sqrt_estimate + (x * invsqrt_estimate));
}

// Dot product of two vectors.
inline float dot(const Vector & a, const Vector & b)
{
    return (a.x * b.x + a.y * b.y + a.z * b.z);
}

// Euclidean length of a vector.
inline float len(const Vector & a)
{
    const float l = dot(a, a);
    return sqrtf(max(0.0f, l));
}

// Reference implementation: sqrtf plus three divisions.
inline Vector normalize_ref(const Vector & a)
{
    const float length = sqrtf(max(0.0f, a.x * a.x + a.y * a.y + a.z * a.z));
    // Using division gives higher precision than multiplying by (1 / length)
    return Vector(a.x / length, a.y / length, a.z / length);
}

// Normalize via the refined SSE reciprocal square root helper.
inline Vector normalize(const Vector & a)
{
    return a * invsqrtf(dot(a, a));
}

// Same computation as normalize(), with the rsqrt refinement written inline.
inline Vector normalize_v1(const Vector & a)
{
    const __m128 pa = _mm_max_ss(_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z),
                                 _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(pa);
    // One more iteration
    const float d = _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
        _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
    return a * d;
}

// Normalize via the bit-trick reciprocal square root (fast, low precision).
inline Vector normalize_v2(const Vector & a)
{
    return a * fastinvsqrt(dot(a, a));
}

// Normalize with the squared length also computed in SSE registers.
inline Vector normalize_v3(const Vector & a)
{
    // TODO: use the SSE4.1 dot product intrinsic (_mm_dp_ps) when available
    const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
    const __m128 s = _mm_mul_ps(x, x);
    // Lane 0 of t = x*x + z*z; lane 1 of t still holds y*y.
    const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    const __m128 pa = _mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)),
                                 _mm_set_ss(1.0e-30f));
    const __m128 r = _mm_rsqrt_ss(pa);
    // One more iteration
    return a * _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
        _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
}

// Writes the normalized vector to r and returns the (clamped) length.
inline float normalize_len(Vector & r, const Vector & a)
{
    const float l = len(a);
    const float d = max(l, 1.0e-30f);
    r = a * rcpf(d);
    return d;
}

// As normalize_len(), with the length computation written inline.
inline float normalize_len_v1(Vector & r, const Vector & a)
{
    const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    r = a * rcpf(d);
    return d;
}

// As normalize_len_v1(), with the refined SSE reciprocal written inline.
inline float normalize_len_v2(Vector & r, const Vector & a)
{
    const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    const __m128 pa = _mm_set_ss(d);
    const __m128 pr = _mm_rcp_ss(pa);
    // One more iteration: r' = 2r - r*r*d
    const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr),
                                              _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    r = a * rd;
    return d;
}

// Square root and reciprocal both done with scalar SSE instructions.
inline float normalize_len_v3(Vector & r, const Vector & a)
{
    const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_set_ss(1.0e-30f),
                                  _mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z)));
    const __m128 pr = _mm_rcp_ss(pa);
    // One more iteration
    const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr),
                                              _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    r = a * rd;
    return _mm_cvtss_f32(pa);
}

// Length via the bit-trick square root (fast, low precision).
inline float normalize_len_v4(Vector & r, const Vector & a)
{
    const float d = fastsqrt(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    r = a * rcpf(d);
    return d;
}

// As normalize_len_v3(), with the squared length also computed in SSE registers.
inline float normalize_len_v5(Vector & r, const Vector & a)
{
    // TODO: use the SSE4.1 dot product intrinsic (_mm_dp_ps) when available
    const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
    const __m128 s = _mm_mul_ps(x, x);
    const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)),
                                             _mm_set_ss(1.0e-30f)));
    const __m128 pr = _mm_rcp_ss(pa);
    // One more iteration
    r = a * _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr),
                                     _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    return _mm_cvtss_f32(pa);
}

// Small deterministic pseudo-random generator built on an integer mixing hash.
// Every test constructs it with the default seed, so all tests see the same
// sequence of vectors.
struct Random
{
    unsigned int state;

    inline Random(unsigned int seed = 0x9e3779b1) { state = hash(seed); }

    // 32-bit integer bit-mixing hash.
    inline unsigned int hash(unsigned int a)
    {
        a = (a + 0x7ed55d16) + (a << 12);
        a = (a ^ 0xc761c23c) ^ (a >> 19);
        a = (a + 0x165667b1) + (a << 5);
        a = (a + 0xd3a2646c) ^ (a << 9);
        a = (a + 0xfd7046c5) + (a << 3);
        a = (a ^ 0xb55a4f09) ^ (a >> 16);
        return a;
    }

    // Next value in [0, 1), built from the low 24 bits of the state.
    inline float next_float()
    {
        state = hash(state);
        return (state & 0xFFFFFF) * (1.0f / float(1 << 24));
    }

    // Next value in [-500, 500).
    inline float next() { return (next_float() * 1000.0f - 500.0f); }
};

// Current processor time in clock() ticks, truncated to int.
int get_time()
{
    return (int)clock();
}

// Signature shared by the vector-returning normalize variants.
typedef Vector (*NormalizeFn)(const Vector &);
// Signature shared by the normalize variants that also return the length.
typedef float (*NormalizeLenFn)(Vector &, const Vector &);

// Draws the next test vector; x, y, z are generated in that order.
inline Vector next_vector(Random & random)
{
    Vector v;
    v.x = random.next();
    v.y = random.next();
    v.z = random.next();
    return v;
}

// Adapts a normalize_len-style function to the NormalizeFn signature by
// discarding the returned length, so one benchmark template serves both.
template <NormalizeLenFn Fn>
inline Vector discard_len(const Vector & a)
{
    Vector r;
    Fn(r, a);
    return r;
}

// Times ntest calls of Fn on random vectors and prints the elapsed ticks with
// rand_time (the cost of generating the inputs alone) subtracted.
// The component sum is printed so the compiler cannot discard the calls.
template <NormalizeFn Fn>
void bench_normalize(const char * label, int ntest, int rand_time)
{
    const int start_time = get_time();
    Random random;
    double sum = 0.0;
    for (int i = 0; i < ntest; ++i) {
        const Vector v = next_vector(random);
        const Vector r = Fn(v);
        sum += (r.x + r.y + r.z);
    }
    const int done_time = get_time() - start_time - rand_time;
    printf("%s: sum = %f time = %d\n", label, sum, done_time);
}

// Compares Fn against normalize_ref on ntest random vectors and prints the
// largest per-component absolute difference seen over the whole run.
template <NormalizeFn Fn>
void check_precision(const char * label, int ntest)
{
    float max_error = 0.0f;
    Random random;
    double sum = 0.0;
    for (int i = 0; i < ntest; ++i) {
        const Vector v = next_vector(random);
        const Vector r1 = normalize_ref(v);
        const Vector r2 = Fn(v);
        sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
        const float error = max(fabsf(r1.x - r2.x),
                                max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
        // Accumulate the running maximum; overwriting it each iteration would
        // report only the final sample's error.
        max_error = max(max_error, error);
    }
    printf("%s: sum = %f max. error = %.17f\n", label, sum, max_error);
}

// Runs the performance pass over every variant, then the precision pass over
// the approximate ones. Times are raw clock() ticks.
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    const int ntest = 100000000;

    // Baseline: cost of generating the random inputs, subtracted from each
    // measurement below.
    int rand_time = 0;
    {
        const int start_time = get_time();
        Random random;
        double sum = 0.0;
        for (int i = 0; i < ntest; ++i) {
            const Vector v = next_vector(random);
            sum += (v.x + v.y + v.z);
        }
        rand_time = get_time() - start_time;
        printf("random: sum = %f time = %d\n", sum, rand_time);
    }

    printf("Testing performance...\n");
    bench_normalize<normalize_ref>("normalize_ref (reference)", ntest, rand_time);
    bench_normalize<normalize>("normalize", ntest, rand_time);
    bench_normalize<normalize_v1>("normalize_v1", ntest, rand_time);
    bench_normalize<normalize_v2>("normalize_v2 (fast)", ntest, rand_time);
    bench_normalize<normalize_v3>("normalize_v3", ntest, rand_time);
    bench_normalize<discard_len<normalize_len> >("normalize_len", ntest, rand_time);
    bench_normalize<discard_len<normalize_len_v1> >("normalize_len_v1", ntest, rand_time);
    bench_normalize<discard_len<normalize_len_v2> >("normalize_len_v2", ntest, rand_time);
    bench_normalize<discard_len<normalize_len_v3> >("normalize_len_v3", ntest, rand_time);
    bench_normalize<discard_len<normalize_len_v4> >("normalize_len_v4 (fast)", ntest, rand_time);
    bench_normalize<discard_len<normalize_len_v5> >("normalize_len_v5", ntest, rand_time);

    printf("Testing precision...\n");
    check_precision<normalize_v1>("normalize_v1", ntest);
    check_precision<normalize_v2>("normalize_v2 (fast)", ntest);
    check_precision<normalize_v3>("normalize_v3", ntest);
    check_precision<discard_len<normalize_len_v3> >("normalize_len_v3", ntest);
    check_precision<discard_len<normalize_len_v4> >("normalize_len_v4 (fast)", ntest);
    check_precision<discard_len<normalize_len_v5> >("normalize_len_v5", ntest);

    return 0;
}

Final conclusion: normalize_v3 is the most cost-effective. If you use normalize in 3D games or other highly interactive applications, consider using this fast implementation, which requires only that the platform support SSE 2.0. If your precision requirements are low, consider the normalize_v2 (fast) version, which uses the fast inverse square root algorithm from the Quake 3 game engine.

Original: The performance and precision of various normalize functions

This article is an English version of an article that was originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership, or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More