Introductions to the SSE instruction set are easy to find on the Internet. This post provides an SSE test program for the VS2008 environment. It implements the same conversion calculation in three ways: plain C++ code, C++ with inline assembly, and C++ with SSE intrinsics. It is a Win32 console program.
Program download: http://download.csdn.net/detail/hemmingway/4598506
Overview of the code in the main file:
// Test_sse.cpp: defines the entry point of the console application. // Calc conversion // # include "stdafx. H "# include <xmmintrin. h> // _ m128 data type and SSE functions # include <float. h> # include <math. h> # include <windows. h> // support odprintf # include <stdarg. h> # include <ctype. h> # include "mmx_ssesupport.h" # include "timecounter. H "# define array_size 100000 # pragma warning (Disable: 4324) // arrays processed by SSE shoshould have 16 bytes alignment :__ declspec (align (16) floa T m_finitialarray [array_size] ;__ declspec (align (16) float m_fresultarray [array_size]; // minimum and maximum values in the result arrayfloat m_fmin; float m_fmax; # define time_start ctimecounter * PT = new ctimecounter () # define time_end Showtime (Pt-> getexecutiontime ()) //////////////////////////////////////// /// // odprintf -- debug functionvoid _ cdecl odprintf (const char * FMT ,...) {char Buf [4096], * P = Buf; va_list ARGs; va_start (ARGs, FMT); P + = vsnprintf_s (p, sizeof (BUF), _ truncate, FMT, ARGs); va_end (ARGs); While (P> Buf & isspace (P [-1]) * -- p = '\ 0 '; * P ++ = '\ R'; * P ++ =' \ n'; * P = '\ 0'; outputdebugstringa (BUF ); // output as ANSI string // outputdebugstring }////////////////////////////// //////////////////////////////////////// ///// show execution time (MS) void Showtime (_ Int64 ntime) {printf ("usage time: % i64d \ n", ntime); // in G ++, the value corresponds to <stdint. h> int64_t, % LLD output should be used }////////////////////////////////// //////////////////////////////////////// // showarray, display array's datavoid showarray (float * parray) {If (! (* Parray) return; float * P = parray; For (INT I = 0; I <array_size; I ++ = 500) // No data is displayed. 
{printf ("% F", P [I]); if (I = 5) printf ("\ n ");} printf ("\ n ");} //////////////////////////////////////// /// // initarray, fill initial arrayvoid initarray () {m_fmin = flt_max; m_fmax = flt_min; float F; int I; for (I = 0; I <array_size; I ++) {// fill array with one sin cycle and ensur E that all values are positive // (to use SQRT in conversion) F = (float) sin (double) I * 6.29/array_size) + 2.0f; if (F <m_fmin) m_fmin = f; If (F> m_fmax) m_fmax = f; m_finitialarray [I] = f;} showarray (m_finitialarray );} //////////////////////////////////////// /// // make conversion using C + code // each initial array member is converted to result array member // Using some formula (just to demonstrate SSE features ). // minimum and maximum result values are calculated and shown. /// function also calculates and shows conversion time (MS ). // void oncplusplus () {time_start; m_fmin = flt_max; m_fmax = flt_min; int I; for (I = 0; I <array_size; I ++) {m_fresultarray [I] = SQRT (m_finitialarray [I] * 2.8f); If (m_fresultarray [I] <m_fmin) m_fmin = m_fresultarray [I]; If (M_fresultarray [I]> m_fmax) m_fmax = m_fresultarray [I];} time_end; showarray (m_fresultarray );} //////////////////////////////////////// /// // onsseassembly, make conversion using C ++ code with Inline assemblyvoid onsseassembly () {time_start; float * pin = m_finitialarray; float * pout = m_fresultarray; float F = 2.8f; float flt_min = flt_min; float flt_max = flt_max ;__ m128 min12 8 ;__ m128 max128; // Using Additional registers: // xmm2-Multiplication Coefficient // xmm3-Minimum // xmm4-maximum_asm {movss xmm2, f // xmm2 [0] = 2.8 shufps xmm2, xmm2, 0 // xmm2 [1, 2, 3] = xmm2 [0] movss xmm3, flt_max // xmm3 = flt_maxshufps xmm3, xmm3, 0 // xmm3 [1, 2, 3] = xmm3 [0] movss xmm4, flt_min // xmm4 = flt_minshufps xmm4, xmm4, 0 // xmm3 [1, 2, 3] = xmm3 [0] mov ESI, pin // input pointermov E Di, pout // output pointermov ECx, array_size/4 // loop counterstart_loop: movaps xmm1, [esi] // xmm1 = [esi] 
mulps xmm1, xmm2 // xmm1 = xmm1 * xmm2sqrtps xmm1, xmm1 // xmm1 = SQRT (xmm1) movaps [EDI], xmm1 // [EDI] = xmm1minps xmm3, xmm1maxps xmm4, xmm1add ESI, 16add EDI, 16dec ecxjnz start_loopmovaps min128, xmm3movaps max128, xmm4} // extract minimum and maximum values from min128 and max128union U {_ M128 m; float f [4];} X; X. M = min128; m_fmin = min (X. f [0], min (X. f [1], min (X. f [2], X. f [3]); X. M = max128; m_fmax = max (X. f [0], max (X. f [1], max (X. f [2], X. f [3]); time_end; showarray (m_fresultarray );} //////////////////////////////////////// /// // onssecpp, make conversion using C ++ code with SSE intrinsicsvoid onssecpp () {time_start ;__ m128 coeff = _ mm_set_ps1 (2.8f); // COE FF [0, 1, 2, 3] = 2.8 _ m128 TMP ;__ m128 min128 = _ mm_set_ps1 (flt_max); // min128 [0, 1, 2, 3] = flt_max _ m128 max128 = _ mm_set_ps1 (flt_min); // max128 [0, 1, 2, 3] = flt_min _ m128 * psource = (_ m128 *) m_finitialarray; __m128 * pdest = (_ m128 *) m_fresultarray; For (INT I = 0; I <array_size/4; I ++) {TMP = _ mm_mul_ps (* psource, coeff); // TMP = * psource * coeff * pdest = _ mm_sqrt_ps (TMP ); // * pdest = SQRT (T MP) min128 = _ mm_min_ps (* pdest, min128); max128 = _ mm_max_ps (* pdest, max128); psource ++; pdest ++ ;} // extract minimum and maximum values from min128 and max128union U {__ m128 m; float f [4] ;}x; X. M = min128; m_fmin = min (X. f [0], min (X. f [1], min (X. f [2], X. f [3]); X. M = max128; m_fmax = max (X. f [0], max (X. f [1], max (X. f [2], X. f [3]); time_end; showarray (m_fresultarray);} int _ tmain (INT argc, _ tchar * argv []) {// te St SSE support? Bool bmmx, BSSE; testfeatures (& bmmx, & BSSE); If (! BSSE) {// do not support sseodprintf ("do not support SSE. \ n "); Return 0;} odprintf (" everything is OK... "); // first, prepare dataprintf (" program generate % d floating point (not all data are displayed )... 
\ n ", array_size); initarray (); // second, make conversion using C ++ codegetchar (); printf ("make conversion using C ++ Code \ n"); oncplusplus (); // third, make conversion using C ++ code with Inline assemblygetchar (); printf ("make conversion using C ++ code with inline assembly \ n"); onsseassembly (); // finally, make conversion using C ++ code with SSE intrinsics getchar (); printf ("make conversion using C ++ code with SSE intrinsics \ n"); onssecpp (); getchar (); Return 0 ;}