I looked at the uint32sqrt test program packaged by liangbch and found that the FPU-based variants are hard to improve much further. Still, I made a few minor changes and provide four versions here, alongside isqrt_fpu2_yaos and isqrt_fpu1_lbc for comparison.
Code:# Include <stdio. h> <br/> # include <time. h> <br/> typedef unsigned int DWORD; <br/> double B32 [] = {0.0, 4294967296.0}; </P> <p >__ declspec (naked) <br/> DWORD _ fastcall isqrt_fpu2_yaos (dword n) <br/>{< br/>__ ASM <br/>{< br/> push ECx <br/> mov eax, ECx <br/> and eax, 0x80000000 <br/> SHR eax, 31 <br/> define qword PTR [B32 + eax * 8] <br/> fild dword ptr [esp] <br/> faddp ST (1 ), st <br/> fsqrt <br/> sub ESP, 8 <br/> fstp qword PTR [esp] <br/> mov edX, dword ptr [esp + 4] <br/> mov eax, EDX <br/> and EDX, 0x7ff00000 <br/> and eax, 0 xfffff <br/> SHR edX, 20 <br/> or eax, 0x100000 <br/> xchg ECx, EDX <br/> sub ECx, 1043 <br/> neg ECx <br/> SHR eax, CL <br/> xchg edX, ECx <br/> Add ESP, 12 <br/> or ECX, ECx <br/> cmove eax, ECX <br/> RET <br/>}</P> <p> _ declspec (naked) <br/> DWORD fast_sqrt1 (dword x) <br/>{< br/> _ ASM <br/> {<br/> sub ESP, 4 </P> <p> mov dword ptr [esp + 12], 0 <br/> fild qword PTR [esp + 8] <br/> fsqrt <br/> fisttp dword ptr [esp] <br/> mov eax, [esp] <br/> Add ESP, 4 <br/> RET <br/>}</P> <p> _ declspec (naked) <br/> DWORD fast_sqrt2 (dword x) <br/>{< br/> _ ASM <br/>{</P> <p> Add ESP, 12 </P> <p> mov dword ptr [esp-4], 0 <br/> fild qword PTR [esp-8] </P> <p> fsqrt <br/> fstp qword PTR [esp] </P> <p> mov ECx, [esp + 4]; // exponential processing <br/> mov eax, ECx; // processing the ending number <br/> SHR ECx, 20 <br/> and eax, 0 xfffff <br/> sub ESP, 12 <br/> or eax, 0x100000 <br/> test ECx, ECx; // processing 0 <br/> cmove eax, ECx <br/> sub ECx, 1075 <br/> neg ECx <br/> SHR eax, CL </P> <p> RET <br/>}</P> <p> _ declspec (naked) <br/> DWORD fast_sqrt3 (dword x) <br/>{< br/> _ ASM <br/>{</P> <p> Add ESP, 4 </P> <p> mov eax, dword ptr [esp] <br/> and eax, 0x80000000 <br/> SHR eax, 28 <br/> define qword PTR [B32 + eax] <br/> fild dword ptr [esp] <br/> faddp ST (1 ), st </P> <p> fsqrt </P> <p> fstp qword PTR [esp + 4] </P> <p> mov ECx, [esp + 8]; // exponential processing <br/> mov eax, ECx; // processing the ending number <br/> 
SHR ECx, 20 <br/> and eax, 0 xfffff <br/> sub ESP, 4 <br/> or eax, 0x100000 <br/> test ECx, ECx; // processing 0 <br/> cmove eax, ECx <br/> sub ECx, 1075 <br/> neg ECx <br/> SHR eax, CL </P> <p> RET <br/>}< br/>__ declspec (naked) <br/> DWORD _ fastcall fast_sqrt4 (dword n) <br/>{< br/>__ ASM <br/>{< br/> push ECx <br/> SHR ECx, 31 <br/> define qword PTR [B32 + ECx * 8] <br/> fild dword ptr [esp] <br/> faddp ST (1 ), st <br/> fsqrt </P> <p> fisttp dword ptr [esp] <br/> pop eax <br/> RET <br/>}< br/>} </P> <p> double zero5 = 0.49999999999636 ;; </P> <p >__ declspec (naked) <br/> DWORD _ fastcall isqrt_fpu1_lbc (dword n) <br/>{< br/>__ ASM <br/>{< br/> push ECx </P> <p> SHR ECx, 31 <br/> define qword PTR [B32 + ECx * 8] <br/> fild dword ptr [esp] <br/> faddp ST (1 ), st <br/> fsqrt <br/> fsub qword PTR [zero5] <br/> fistp dword ptr [esp] <br/> pop eax <br/> RET <br/ >}< br/>}</P> <p> int main () <br/>{< br/> double T0, T1; <br/> dword I; </P> <p> printf ("elapsed time:/N "); <br/> // =============================< br/> fast_sqrt1 (0 ); <br/> t0 = clock (); </P> <p> for (I = 0; I <= 0 xfffffff; I ++) // test <br/>{< br/> fast_sqrt1 (I); <br/>}< br/> printf ("fast_sqrt1: % F S/N ", (Clock ()-T0)/clocks_per_sec ); <br/> // =============================< br/> fast_sqrt2 (0 ); <br/> t0 = clock (); </P> <p> for (I = 0; I <= 0 xfffffff; I ++) // test <br/>{< br/> fast_sqrt2 (I); <br/>}< br/> printf ("fast_sqrt2: % F S/N ", (Clock ()-T0)/clocks_per_sec ); <br/> // =============================< br/> isqrt_fpu2_yaos (0 ); <br/> t0 = clock (); <br/> for (I = 0; I <= 0 xfffffff; I ++) // test <br/>{< br/> isqrt_fpu2_yaos (I); <br/>}< br/> printf ("isqrt_fpu2_yaos: % F S/N ", (Clock ()-T0)/clocks_per_sec ); <br/> // =============================< br/> fast_sqrt3 (0 ); <br/> t0 = clock (); <br/> for (I = 0; I <= 0 xfffffff; I ++) // test <br/>{< br/> fast_sqrt3 (I); <br/>}< br/> printf ("fast_sqrt3: % F S/N ", (Clock ()-T0)/clocks_per_sec ); <br/> // 
=============================< br/> fast_sqrt4 (0 ); <br/> t0 = clock (); <br/> for (I = 0; I <= 0 xfffffff; I ++) // test <br/>{< br/> fast_sqrt4 (I); <br/>}< br/> printf ("fast_sqrt4: % F S/N ", (Clock ()-T0)/clocks_per_sec ); <br/> // =============================< br/> isqrt_fpu1_lbc (0 ); <br/> t0 = clock (); <br/> for (I = 0; I <= 0 xfffffff; I ++) // test <br/>{< br/> isqrt_fpu1_lbc (I); <br/>}< br/> printf ("isqrt_fpu1_lbc: % F S/N ", (Clock ()-T0)/clocks_per_sec ); </P> <p> // ===============================< br /> printf ("/n/nboundary test. /n "); </P> <p> printf (" fast_sqrt1 (0) = % 10u/N ", fast_sqrt1 (0 )); <br/> printf ("fast_sqrt1 (0 xffffffff) = % u/n", fast_sqrt1 (0 xffffffffff )); </P> <p> printf ("fast_sqrt2 (0) = % 10u/N", fast_sqrt2 (0); <br/> printf ("fast_sqrt2 (0 xffffffffff) = % u/n ", fast_sqrt2 (0 xffffffff); </P> <p> printf (" fast_sqrt3 (0) = % 10u/N ", fast_sqrt3 (0); <br/> printf ("fast_sqrt3 (0 xffffffff) = % u/n", fast_sqrt3 (0 xffffffff )); </P> <p> printf ("fast_sqrt4 (0) = % 10u/N", fast_sqrt4 (0); <br/> printf ("fast_sqrt4 (0 xffffffffff) = % u/N ", fast_sqrt4 (0 xffffffff); </P> <p> printf (" isqrt_fpu1_lbc (0) = % 10u/N ", isqrt_fpu1_lbc (0); <br/> printf ("isqrt_fpu1_lbc (0 xffffffff) = % u/N", isqrt_fpu1_lbc (0 xffffffffff )); </P> <p> return 0; </P> <p >}< br/>