Memchr SSE Acceleration

Source: Internet
Author: User

memchr_sse.s

/*
 * void *memchr_sse(const void *s, int c, size_t n)
 *
 * Syntax: GNU as, AT&T operand order (source, destination).
 * ABI:    arguments are read from rdi (s), rsi (c) and rdx (n), and the
 *         result is returned in rax, i.e. the System V AMD64 convention.
 * In:     rdi = s, rsi = c (only the low byte is searched for), rdx = n
 * Out:    rax = address of the first byte equal to (unsigned char)c within
 *         the first n bytes of s, or 0 if there is none
 * Clobb:  rcx, rdx, rdi, xmm0-xmm4, flags
 * Stack:  none (leaf routine)
 *
 * Strategy: the search byte is broadcast into all 16 lanes of xmm1 and the
 * buffer is examined 16 bytes at a time with pcmpeqb/pmovmskb.  After the
 * unaligned head has been handled, rdi is kept 16-byte aligned so that
 * every further load is an aligned movdqa.  Once rdi is 64-byte aligned
 * the main loop tests 64 bytes per iteration.
 *
 * Register roles in the body:
 *   rdi  = current (aligned) scan position
 *   rdx  = byte budget, biased by the block size that was just subtracted
 *   xmm1 = search byte replicated 16 times
 */
        .text
        .globl  memchr_sse
        .type   memchr_sse, @function
        .p2align 4, 0x90
memchr_sse:
        movd    %rsi, %xmm1
        mov     %rdi, %rcx
        punpcklbw %xmm1, %xmm1          /* c -> 2 copies */
        test    %rdx, %rdx
        jz      L_return_null           /* n == 0: nothing to search */
        punpcklbw %xmm1, %xmm1          /* -> 4 copies */

        and     $63, %rcx               /* rcx = s mod 64 */
        pshufd  $0, %xmm1, %xmm1        /* -> 16 copies */

        /* A 16-byte load starting more than 48 bytes into a 64-byte
           line would run past the end of that line; take the aligned
           path instead.  */
        cmp     $48, %rcx
        ja      L_crosscache

        movdqu  (%rdi), %xmm0           /* unaligned head: bytes s[0..15] */
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches_1             /* hit, but must still be < n */

        sub     $16, %rdx
        jbe     L_return_null           /* n <= 16: whole range scanned */
        add     $16, %rdi
        and     $15, %rcx               /* rcx = s mod 16 */
        and     $-16, %rdi              /* round scan position down to 16 */
        add     %rcx, %rdx              /* re-credit the bytes stepped back over */
        sub     $64, %rdx
        jbe     L_exit_loop             /* 64 bytes or fewer remain */
        jmp     L_loop_prolog

        .p2align 4
L_crosscache:
        and     $15, %rcx               /* rcx = s mod 16 */
        and     $-16, %rdi
        movdqa  (%rdi), %xmm0           /* aligned block containing s[0] */
        pcmpeqb %xmm1, %xmm0
        /* Check if there is a match.  */
        pmovmskb %xmm0, %eax
        /* Remove the leading bytes (those located before s).  */
        sar     %cl, %eax
        test    %eax, %eax
        je      L_unaligned_no_match
        /* Check which byte is a match.  */
        bsf     %eax, %eax              /* rax = index relative to s */
        sub     %rax, %rdx
        jbe     L_return_null           /* index >= n */
        add     %rdi, %rax
        add     %rcx, %rax              /* aligned base + (s mod 16) + index */
        ret

        .p2align 4
L_unaligned_no_match:
        /* rcx < 16 here.  The bytes still to scan number n + rcx - 16.
           Compute that as n - (16 - rcx): forming n + rcx first would
           wrap around for n close to SIZE_MAX and make the check below
           return NULL without searching.  */
        neg     %rcx
        add     $16, %rcx               /* rcx = 16 - (s mod 16) */
        sub     %rcx, %rdx
        jbe     L_return_null
        add     $16, %rdi
        sub     $64, %rdx
        jbe     L_exit_loop

        .p2align 4
L_loop_prolog:
        /* More than 64 bytes remain: scan four aligned blocks with no
           per-hit bound check.  */
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32

        movdqa  48(%rdi), %xmm4
        pcmpeqb %xmm1, %xmm4
        add     $64, %rdi
        pmovmskb %xmm4, %eax
        test    %eax, %eax
        jnz     L_matches0              /* rdi already advanced by 64 */

        test    $0x3f, %rdi
        jz      L_align64_loop          /* already 64-byte aligned */

        sub     $64, %rdx
        jbe     L_exit_loop

        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32

        movdqa  48(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        add     $64, %rdi
        test    %eax, %eax
        jnz     L_matches0

        /* Round rdi down to a 64-byte boundary and give the bytes that
           will be scanned a second time back to the budget.  */
        mov     %rdi, %rcx
        and     $-64, %rdi
        and     $63, %rcx
        add     %rcx, %rdx

        .p2align 4
L_align64_loop:
        sub     $64, %rdx
        jbe     L_exit_loop             /* 64 bytes or fewer remain */
        movdqa  (%rdi), %xmm0
        movdqa  16(%rdi), %xmm2
        movdqa  32(%rdi), %xmm3
        movdqa  48(%rdi), %xmm4

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm1, %xmm2
        pcmpeqb %xmm1, %xmm3
        pcmpeqb %xmm1, %xmm4

        /* Each lane is 0x00 or 0xff after pcmpeqb, so the unsigned
           byte maximum merges the four results into xmm4.  */
        pmaxub  %xmm0, %xmm3
        pmaxub  %xmm2, %xmm4
        pmaxub  %xmm3, %xmm4
        pmovmskb %xmm4, %eax

        add     $64, %rdi

        test    %eax, %eax
        jz      L_align64_loop

        /* A hit lies somewhere in the 64 bytes just tested.  */
        sub     $64, %rdi

        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        /* xmm3 was overwritten by pmaxub above; redo that compare.  */
        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3

        /* Overwrites the pattern in xmm1; every path from here returns.  */
        pcmpeqb 48(%rdi), %xmm1
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32

        pmovmskb %xmm1, %eax
        bsf     %eax, %eax
        lea     48(%rdi, %rax), %rax
        ret

        .p2align 4
L_exit_loop:
        /* Entered with rdx = (bytes remaining) - 64, which is <= 0.  */
        add     $32, %rdx               /* rdx = remaining - 32 */
        jle     L_exit_loop_32          /* 32 bytes or fewer remain */

        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32_1           /* third block may extend past n */

        sub     $16, %rdx               /* rdx = remaining - 48 */
        jle     L_return_null

        pcmpeqb 48(%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     L_matches48_1
        xor     %eax, %eax
        ret

        .p2align 4
L_exit_loop_32:
        add     $32, %rdx               /* rdx = bytes remaining */
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches_1

        sub     $16, %rdx
        jbe     L_return_null

        pcmpeqb 16(%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     L_matches16_1
        xor     %eax, %eax
        ret

        /* Hit handlers.  The plain variants return the hit as-is; the
           "_1" variants first check the hit's index against the budget
           left in rdx and return NULL when the hit lies at or beyond it.  */

        .p2align 4
L_matches0:
        /* rdi was already advanced by 64; the hit is in the block at rdi - 16.  */
        bsf     %eax, %eax
        lea     -16(%rax, %rdi), %rax
        ret

        .p2align 4
L_matches:
        bsf     %eax, %eax
        add     %rdi, %rax
        ret

        .p2align 4
L_matches16:
        bsf     %eax, %eax
        lea     16(%rax, %rdi), %rax
        ret

        .p2align 4
L_matches32:
        bsf     %eax, %eax
        lea     32(%rax, %rdi), %rax
        ret

        .p2align 4
L_matches_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        add     %rdi, %rax
        ret

        .p2align 4
L_matches16_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        lea     16(%rdi, %rax), %rax
        ret

        .p2align 4
L_matches32_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        lea     32(%rdi, %rax), %rax
        ret

        .p2align 4
L_matches48_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        lea     48(%rdi, %rax), %rax
        ret

        .p2align 4
L_return_null:
        xor     %eax, %eax              /* writing eax also clears the upper half of rax */
        ret

        .size   memchr_sse, .-memchr_sse

        /* This object does not need an executable stack.  */
        .section .note.GNU-stack, "", @progbits

Test stub

stub.c

#include <stdio.h>#include <stdlib.h>#include <string.h>#include <unistd.h>#include <stdint.h>#include "common.h"extern void *memchr_sse(const void *s, int c, size_t n);int main(int argc, char **argv){char text[1024] = {0};void *result = NULL;uint64_t begin, end;memset(text, 'A', 1024);text[1022] = '\r';begin = get_cycle_count();//result = memchr_sse(text, '\r', 1024);result = memchr(text, '\r', 1024);end = get_cycle_count();if (result){printf("result @ %u cost %lu\n", result - (void *)text, end - begin);}return 0;}

Compile

gcc -march=corei7 -O3 memchr_sse.s stub.c -o stub

Test Platform:

Intel(R) Xeon(R) CPU E31230 @ 3.20GHz

Memchr test results

# ./stub
result @ 1022 cost 1404
# ./stub
result @ 1022 cost 1600
# ./stub
result @ 1022 cost 1452
# ./stub
result @ 1022 cost 1388
# ./stub
result @ 1022 cost 1440

Memchr_sse test results
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 568
# ./stub
result @ 1022 cost 572
# ./stub
result @ 1022 cost 612
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 520

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.