Use the Intrinsics method to implement SIMD ProcessingWith Intrinsics, you can use the hardware SIMD command for processing. MMX, SSE, and SSE2 (AMD has achieved this technology through cross-authorization) seem so nice? Let's wait and see. The fixed point technology is also used in processing. Float test_SIMD_Filter () {// simulate the application for X1024 32bpp image memory _ m128i * buf = (_ m128i *) _ mm_malloc (1024*1024 * sizeof (int ), 16); // 0x0000ffffff0000ffffffffff is used to calculate the filter ratio of 65535 _ m128i _ 65535 = _ mm_set0000epi32 (0 xFFFF); // background color, assume that the value is pixel _ m128i pixel_bg = _ partition (0x000000FF, 0x000000F8, 0x000000F8, 0x000000F8); // used to filter out the ALPHA component _ m128i noalpha_mask = _ partition (0x000000FF, 0x000000FF, zero x 0000000 FF, 0x0000000FF); // values smaller than this color will be filtered out _ m128i filter_val = _ mm_set_epi32 (0x00000000, 0x00000008, 0x000000008, 0x00000008 ); _ m128i * ptr = buf; // Image Data Pointer // initialize data first for (int h = 0; h <1024; h ++) // loop by row {for (int w = 0; w <1024/4; w ++) // four vertices (4X32 = 128) at a time) {// set the Four pixels to a special value * (_ m128i *) ptr = _ mm_set_epi32 (0xFF112233, 0xFF445566, 0xFF778899, 0 xFFAABBCC); ptr ++; // next four vertices} ptr = buf; // start Analog Processing BEGIN_PERF () // start to count for (int h = 0; h <1024; h ++) // or loop by row {for (int w = 0; w <1024/4; w ++) // four vertices at a time {_ m128i pixel = * ptr; // four pixels: 0xFF112233 0xFF445566 0xFF778899 0 xFFAABBCC // retrieve the first two pixels, become-> 00FF, 0011,002 2, 0033, 00FF, 0044,005 5, 0066 _ m128i pixel_1234 = _ mm_unpacklo_epi8 (pixel, _ mm_setzero_si128 (); // retrieve the first and second pixels, become-> 00FF, 0077,008 8, 0099, 00FF, 00AA, 00BB, 00CC _ m128i pixel_5678 = _ Mm_unpackhi_epi8 (pixel, _ mm_setzero_si128 (); // because 32-bit multiplication is involved, therefore, you also 
need to extend the pixel color component to the 32-bit format //-> 00000000 00000011 00000022 00000033 first pixel _ m128i pixel_12 = _ mm_unpacklo_epi8 (pixel_1234, _ mm_setzero_si128 ()); //-> 00000000 00000044 00000055 00000066 second pixel _ m128i pixel_34 = _ mm_unpackhi_epi8 (pixel_1234, _ mm_setzero_si128 ()); //-> 00000000 00000077 00000088 00000099 third pixel _ m128i pixel_56 = _ mm_unpacklo_epi8 (Pixel_5678, _ mm_setzero_si128 (); //-> 00000000 pixel 00aa pixel 00bb pixel 00cc fourth pixel _ m128i pixel_78 = _ pixel (pixel_5678, _ mm_setzero_si128 ()); // subtract first and then compare whether the value is 0, similar to the subtraction and & Operation _ m128i cmp_res = _ mm_cmplt_epi32 (_ mm_sub_epi32 (noalpha_mask, pixel_12), filter_val) in the traditional method ); _ m128i delta, bg; // filter ratio, background color // first determine the first pixel if (_ mm_cvtsi128_si32 (cmp_res )! = 0) // For comparison, You need to convert it to an integer {// calculate the filter ratio delta = _ mm_slli_epi32 (pixel_12, 8); // calculate the low and high bit values respectively, then, or (shift left to a high position), get the filtered background color bg = _ mm_or_si128 (_ mm_mullo_epi16 (pixel_bg, delta), _ mm_slli_epi32 (_ mm_mulhi_ep2010( pixel_bg, delta ), 16); delta = _ mm_sub_epi32 (_ 65535, delta ); // 65535-ratio: original pixel ratio // same as the calculation background filtering method, pixel_12 = _ mm_or_si128 (_ mm_mullo_epi16 (pixel_12, delta), _ round (_ mm_mulhi_ep2010( pixel_12, delta), 16); pixel_12 = _ mm_srli_epi32 (pixel_12, 16); // restore from fixed points} // The third pixel .... // The fourth pixel is omitted ...... // Finally, assemble the data back. 
// The first and second pixels pixel_12 = _ mm_packs_epi32 (pixel_12, pixel_34); // The third pixel pixel_56 = _ mm_packs_epi32 (pixel_56, pixel_78 ); // write back * ptr ++ = _ mm_packs_epi16 (pixel_12, pixel_56) ;}} END_PERF () // stop the timer _ mm_free (buf); // release the memory return GET_PERF (); // return results
}
With SIMD, four pixels are processed at a time. That looks as if it should be very fast, but the various unpack (widening) operations cancel out the performance gain, so the speed ends up far behind the traditional algorithm!