//
Http://stackoverflow.com/questions/1715224/very-fast-memcpy-for-image-processing
Courtesy
William Chan and Google. 30-70% faster than memcpy in Microsoft Visual Studio 2005.
void X_aligned_memcpy_sse2(void* dest, const void* src, const unsigned long size_t){ __asm { mov esi, src; //src pointer mov edi, dest; //dest pointer mov ebx, size_t; //ebx is our counter shr ebx, 7; //divide by 128 (8 * 128bit registers) loop_copy: prefetchnta 128[ESI]; //SSE2 prefetch prefetchnta 160[ESI]; prefetchnta 192[ESI]; prefetchnta 224[ESI]; movdqa xmm0, 0[ESI]; //move data from src to registers movdqa xmm1, 16[ESI]; movdqa xmm2, 32[ESI]; movdqa xmm3, 48[ESI]; movdqa xmm4, 64[ESI]; movdqa xmm5, 80[ESI]; movdqa xmm6, 96[ESI]; movdqa xmm7, 112[ESI]; movntdq 0[EDI], xmm0; //move data from registers to dest movntdq 16[EDI], xmm1; movntdq 32[EDI], xmm2; movntdq 48[EDI], xmm3; movntdq 64[EDI], xmm4; movntdq 80[EDI], xmm5; movntdq 96[EDI], xmm6; movntdq 112[EDI], xmm7; add esi, 128; add edi, 128; dec ebx; jnz loop_copy; //loop please loop_copy_end: }}
You may be able to optimize it further depending on your exact situation and any assumptions you are able to make.
You may also want to check out the memcpy source (memcpy. ASM) and strip out its special case handling. It may be possible to optimise further!