SSE instruction set

Source: Internet
Author: User

The SSE instruction set is already well documented on the Internet. Here we post an SSE test program for the VS2008 environment that implements the same conversion calculation in three ways: plain C++ code, C++ with inline assembly, and C++ with SSE intrinsics. It is a Win32 console program.


Program: http://download.csdn.net/detail/hemmingway/4598506


Code overview of the main file:

// Test_sse.cpp: defines the entry point of the console application. // Calc conversion // # include "stdafx. H "# include <xmmintrin. h> // _ m128 data type and SSE functions # include <float. h> # include <math. h> # include <windows. h> // support odprintf # include <stdarg. h> # include <ctype. h> # include "mmx_ssesupport.h" # include "timecounter. H "# define array_size 100000 # pragma warning (Disable: 4324) // arrays processed by SSE shoshould have 16 bytes alignment :__ declspec (align (16) floa T m_finitialarray [array_size] ;__ declspec (align (16) float m_fresultarray [array_size]; // minimum and maximum values in the result arrayfloat m_fmin; float m_fmax; # define time_start ctimecounter * PT = new ctimecounter () # define time_end Showtime (Pt-> getexecutiontime ()) //////////////////////////////////////// /// // odprintf -- debug functionvoid _ cdecl odprintf (const char * FMT ,...) {char Buf [4096], * P = Buf; va_list ARGs; va_start (ARGs, FMT); P + = vsnprintf_s (p, sizeof (BUF), _ truncate, FMT, ARGs); va_end (ARGs); While (P> Buf & isspace (P [-1]) * -- p = '\ 0 '; * P ++ = '\ R'; * P ++ =' \ n'; * P = '\ 0'; outputdebugstringa (BUF ); // output as ANSI string // outputdebugstring }////////////////////////////// //////////////////////////////////////// ///// show execution time (MS) void Showtime (_ Int64 ntime) {printf ("usage time: % i64d \ n", ntime); // in G ++, the value corresponds to <stdint. h> int64_t, % LLD output should be used }////////////////////////////////// //////////////////////////////////////// // showarray, display array's datavoid showarray (float * parray) {If (! (* Parray) return; float * P = parray; For (INT I = 0; I <array_size; I ++ = 500) // No data is displayed. 
{printf ("% F", P [I]); if (I = 5) printf ("\ n ");} printf ("\ n ");} //////////////////////////////////////// /// // initarray, fill initial arrayvoid initarray () {m_fmin = flt_max; m_fmax = flt_min; float F; int I; for (I = 0; I <array_size; I ++) {// fill array with one sin cycle and ensur E that all values are positive // (to use SQRT in conversion) F = (float) sin (double) I * 6.29/array_size) + 2.0f; if (F <m_fmin) m_fmin = f; If (F> m_fmax) m_fmax = f; m_finitialarray [I] = f;} showarray (m_finitialarray );} //////////////////////////////////////// /// // make conversion using C + code // each initial array member is converted to result array member // Using some formula (just to demonstrate SSE features ). // minimum and maximum result values are calculated and shown. /// function also calculates and shows conversion time (MS ). // void oncplusplus () {time_start; m_fmin = flt_max; m_fmax = flt_min; int I; for (I = 0; I <array_size; I ++) {m_fresultarray [I] = SQRT (m_finitialarray [I] * 2.8f); If (m_fresultarray [I] <m_fmin) m_fmin = m_fresultarray [I]; If (M_fresultarray [I]> m_fmax) m_fmax = m_fresultarray [I];} time_end; showarray (m_fresultarray );} //////////////////////////////////////// /// // onsseassembly, make conversion using C ++ code with Inline assemblyvoid onsseassembly () {time_start; float * pin = m_finitialarray; float * pout = m_fresultarray; float F = 2.8f; float flt_min = flt_min; float flt_max = flt_max ;__ m128 min12 8 ;__ m128 max128; // Using Additional registers: // xmm2-Multiplication Coefficient // xmm3-Minimum // xmm4-maximum_asm {movss xmm2, f // xmm2 [0] = 2.8 shufps xmm2, xmm2, 0 // xmm2 [1, 2, 3] = xmm2 [0] movss xmm3, flt_max // xmm3 = flt_maxshufps xmm3, xmm3, 0 // xmm3 [1, 2, 3] = xmm3 [0] movss xmm4, flt_min // xmm4 = flt_minshufps xmm4, xmm4, 0 // xmm3 [1, 2, 3] = xmm3 [0] mov ESI, pin // input pointermov E Di, pout // output pointermov ECx, array_size/4 // loop counterstart_loop: movaps xmm1, [esi] // xmm1 = [esi] 
mulps xmm1, xmm2 // xmm1 = xmm1 * xmm2sqrtps xmm1, xmm1 // xmm1 = SQRT (xmm1) movaps [EDI], xmm1 // [EDI] = xmm1minps xmm3, xmm1maxps xmm4, xmm1add ESI, 16add EDI, 16dec ecxjnz start_loopmovaps min128, xmm3movaps max128, xmm4} // extract minimum and maximum values from min128 and max128union U {_ M128 m; float f [4];} X; X. M = min128; m_fmin = min (X. f [0], min (X. f [1], min (X. f [2], X. f [3]); X. M = max128; m_fmax = max (X. f [0], max (X. f [1], max (X. f [2], X. f [3]); time_end; showarray (m_fresultarray );} //////////////////////////////////////// /// // onssecpp, make conversion using C ++ code with SSE intrinsicsvoid onssecpp () {time_start ;__ m128 coeff = _ mm_set_ps1 (2.8f); // COE FF [0, 1, 2, 3] = 2.8 _ m128 TMP ;__ m128 min128 = _ mm_set_ps1 (flt_max); // min128 [0, 1, 2, 3] = flt_max _ m128 max128 = _ mm_set_ps1 (flt_min); // max128 [0, 1, 2, 3] = flt_min _ m128 * psource = (_ m128 *) m_finitialarray; __m128 * pdest = (_ m128 *) m_fresultarray; For (INT I = 0; I <array_size/4; I ++) {TMP = _ mm_mul_ps (* psource, coeff); // TMP = * psource * coeff * pdest = _ mm_sqrt_ps (TMP ); // * pdest = SQRT (T MP) min128 = _ mm_min_ps (* pdest, min128); max128 = _ mm_max_ps (* pdest, max128); psource ++; pdest ++ ;} // extract minimum and maximum values from min128 and max128union U {__ m128 m; float f [4] ;}x; X. M = min128; m_fmin = min (X. f [0], min (X. f [1], min (X. f [2], X. f [3]); X. M = max128; m_fmax = max (X. f [0], max (X. f [1], max (X. f [2], X. f [3]); time_end; showarray (m_fresultarray);} int _ tmain (INT argc, _ tchar * argv []) {// te St SSE support? Bool bmmx, BSSE; testfeatures (& bmmx, & BSSE); If (! BSSE) {// do not support sseodprintf ("do not support SSE. \ n "); Return 0;} odprintf (" everything is OK... "); // first, prepare dataprintf (" program generate % d floating point (not all data are displayed )... 
\ n ", array_size); initarray (); // second, make conversion using C ++ codegetchar (); printf ("make conversion using C ++ Code \ n"); oncplusplus (); // third, make conversion using C ++ code with Inline assemblygetchar (); printf ("make conversion using C ++ code with inline assembly \ n"); onsseassembly (); // finally, make conversion using C ++ code with SSE intrinsics getchar (); printf ("make conversion using C ++ code with SSE intrinsics \ n"); onssecpp (); getchar (); Return 0 ;}


Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please write us an email, and we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.