關於下面兩段代碼的解釋:
引自水木Maling:
"寫雖然是16bytes/cycle但那是througput 也就是說我們需要儘可能使每個cycle有一個寫操作進入write buffer after allocation stage從而保證 16bytes/cycle. 上面的代碼palignr 在前端阻擋(從讀取到解碼階段)了寫操作儘快的進入decode以及allocatoin stage 所以每個迴圈會比下面的代碼多出 2~3個cycle",該文章全部內容參見最後的參考閱讀。
我將Maling的代碼做了一個實現,但由於我實驗機不支援SSE3指令,在運行palignr指令會出非法操作,dmesg給出:trap invalid opcode rip:4004cc rsp:7fff74e5b728 error:0,這樣的錯誤,因此改成了三段指令拼的方法。
理解這段代碼的功能並不複雜,但為什麼快的原因,就是Maling說的那幾條,我們可以看出在亂序後效能的提升,這是一個很好的例子,我把它引在了我的系列中,代碼和原理來自Maling,我只是封裝了下實現,便於大家實際運行,並體驗。
#include<stdio.h>
#include<stdlib.h>
#include <stddef.h>
#include <stdint.h>
asm(" .text ");
asm(" .type shl_7, @function ");
asm("shl_7: push %rbp ");
asm(" mov %rsp,%rbp ");
asm("loop: sub $0x80, %rdx ");
asm(" movaps -0x07(%rsi), %xmm1 ");
asm(" movaps 0x09(%rsi), %xmm2 ");
asm(" movaps 0x19(%rsi), %xmm3 ");
asm(" movaps 0x29(%rsi), %xmm4 ");
asm(" movaps 0x39(%rsi), %xmm5 ");
asm(" movaps 0x49(%rsi), %xmm6 ");
asm(" movaps 0x59(%rsi), %xmm7 ");
asm(" movaps 0x69(%rsi), %xmm8 ");
asm(" movaps 0x79(%rsi), %xmm9 ");
asm(" lea 0x80(%rsi), %rsi ");
asm(" psrldq $7, %xmm8 ");
asm(" pslldq $9, %xmm9 ");
asm(" por %xmm8, %xmm9 ");
asm(" psrldq $7, %xmm7 ");
asm(" pslldq $9, %xmm8 ");
asm(" por %xmm7, %xmm8 ");
asm(" psrldq $7, %xmm6 ");
asm(" pslldq $9, %xmm7 ");
asm(" por %xmm6, %xmm7 ");
asm(" psrldq $7, %xmm5 ");
asm(" pslldq $9, %xmm6 ");
asm(" por %xmm5, %xmm6 ");
asm(" psrldq $7, %xmm4 ");
asm(" pslldq $9, %xmm5 ");
asm(" por %xmm4, %xmm5 ");
asm(" psrldq $7, %xmm3 ");
asm(" pslldq $9, %xmm4 ");
asm(" por %xmm3, %xmm4 ");
asm(" psrldq $7, %xmm2 ");
asm(" pslldq $9, %xmm3 ");
asm(" por %xmm2, %xmm3 ");
asm(" psrldq $7, %xmm1 ");
asm(" pslldq $9, %xmm2 ");
asm(" por %xmm1, %xmm2 ");
asm(" movaps %xmm9, 0x70(%rdi) ");
asm(" movaps %xmm8, 0x60(%rdi) ");
asm(" movaps %xmm7, 0x50(%rdi) ");
asm(" movaps %xmm6, 0x40(%rdi) ");
asm(" movaps %xmm5, 0x30(%rdi) ");
asm(" movaps %xmm4, 0x20(%rdi) ");
asm(" movaps %xmm3, 0x10(%rdi) ");
asm(" movaps %xmm2, 0x0(%rdi) ");
asm(" leaveq ");
asm(" retq ");
asm(" .text ");
asm(" .type shl_7_f, @function ");
asm("shl_7_f: push %rbp ");
asm(" mov %rsp,%rbp ");
asm("loop_f: sub $0x80, %rdx ");
asm(" movaps -0x07(%rsi), %xmm1 ");
asm(" movaps 0x09(%rsi), %xmm2 ");
asm(" movaps 0x19(%rsi), %xmm3 ");
asm(" movaps 0x29(%rsi), %xmm4 ");
asm(" movaps 0x39(%rsi), %xmm5 ");
asm(" movaps 0x49(%rsi), %xmm6 ");
asm(" movaps 0x59(%rsi), %xmm7 ");
asm(" movaps 0x69(%rsi), %xmm8 ");
asm(" movaps 0x79(%rsi), %xmm9 ");
asm(" lea 0x80(%rsi), %rsi ");
asm(" psrldq $7, %xmm8 ");
asm(" pslldq $9, %xmm9 ");
asm(" por %xmm8, %xmm9 ");
asm(" movaps %xmm9, 0x70(%rdi) ");
asm(" psrldq $7, %xmm7 ");
asm(" pslldq $9, %xmm8 ");
asm(" por %xmm7, %xmm8 ");
asm(" movaps %xmm8, 0x60(%rdi) ");
asm(" psrldq $7, %xmm6 ");
asm(" pslldq $9, %xmm7 ");
asm(" por %xmm6, %xmm7 ");
asm(" movaps %xmm7, 0x50(%rdi) ");
asm(" psrldq $7, %xmm5 ");
asm(" pslldq $9, %xmm6 ");
asm(" por %xmm5, %xmm6 ");
asm(" movaps %xmm6, 0x40(%rdi) ");
asm(" psrldq $7, %xmm4 ");
asm(" pslldq $9, %xmm5 ");
asm(" por %xmm4, %xmm5 ");
asm(" movaps %xmm5, 0x30(%rdi) ");
asm(" psrldq $7, %xmm3 ");
asm(" pslldq $9, %xmm4 ");
asm(" por %xmm3, %xmm4 ");
asm(" movaps %xmm4, 0x20(%rdi) ");
asm(" psrldq $7, %xmm2 ");
asm(" pslldq $9, %xmm3 ");
asm(" por %xmm2, %xmm3 ");
asm(" movaps %xmm3, 0x10(%rdi) ");
asm(" psrldq $7, %xmm1 ");
asm(" pslldq $9, %xmm2 ");
asm(" por %xmm1, %xmm2 ");
asm(" movaps %xmm2, 0x0(%rdi) ");
asm(" leaveq ");
asm(" retq ");
int main(void)
{
uint8_t * src = (uint8_t*)malloc(sizeof(uint8_t)*(128+7));
uint8_t * des = (uint8_t*)malloc(sizeof(uint8_t)*128);
int i = 0;
for(;i<128;++i)
{
src[i] = 0xFF;
}
src[0] = 0x0;
src[1]=0x1;
src[2]=0x2;
src[7]=0x7;
src[8]=0x8;
src+=7;
/*shl_7(des,src);
i = 0;
for(;i<128;++i)
{
printf("%d/n",des[i]);
}
i = 0;*/
for(;i<100000000;++i)
{
#ifdef SLOW
shl_7(des,src);
#endif
#ifdef FAST
shl_7_f(des,src);
#endif
}
return 0;
}
參見:
(1)http://www.newsmth.net/bbscon.php?bid=272&id=47206