The following example uses two two-dimensional arrays of double-precision floating-point values, each declared as double[512][1024]: 4 MB of data per array, organized as 512 rows and 1024 columns, with each element occupying 8 bytes. For every row, we add the first element of that row in one array to the first element of the same row in the other array, and save the result to an array allocated on the stack.
The assembly file that implements the algorithm is as follows:
/* Assembler setup and exported symbols for the benchmark routines. */
/* NOTE(review): .align is issued before .text, so it applies to whatever
   section is current at this point; its operand is a power of two on some
   targets and a byte count on others - confirm the intended alignment. */
.align 2
.text
/* NOTE(review): _fast_int_sqrt is exported here but no definition is visible
   in this listing - confirm it is defined elsewhere. */
.globl _fast_int_sqrt
/* The C test code calls the next three routines without the leading underscore. */
.globl _get_cycles
.globl _naive_calc
.globl _opt_calc
/*
 * unsigned long get_cycles(void)
 *
 * Reads the processor time-stamp counter and returns it as one 64-bit value.
 * Out:      %rax = counter value
 * Clobbers: %rdx, flags
 */
_get_cycles:
        rdtsc                           /* high half in %edx, low half in %eax */
        shlq    $32, %rdx               /* move the high half up to bits 63..32 */
        orq     %rdx, %rax              /* combine both halves in %rax */
        retq
/*
 * void naive_calc(void *dst, const void *src1, const void *src2)
 *
 * For each of 512 rows, computes 2 * (src1[row][0] + src2[row][0]) and stores
 * it to dst[row]. Rows are 1024 doubles (8 KB) apart. The whole pass is
 * repeated 100 times so that the run time is long enough to measure.
 *
 * In:       %rdi = dst, %rsi = src1, %rdx = src2
 * Clobbers: %rax, %rcx, %r8, %r9, flags, x87 stack (left empty on return)
 *
 * The doubling used to be done with "fcmove %st(1), %st(0)", which only copies
 * when ZF is set. ZF was set for the first row (left over from the xor) but
 * clear for every later row, so rows 1..511 received src1 + 2 * src2 instead.
 * The sum is now duplicated unconditionally with "fld %st(0)".
 */
_naive_calc:
        mov     $100, %r9               /* number of full passes */
native_loop:
        mov     $512, %r8               /* rows left in this pass */
        xor     %rcx, %rcx              /* byte offset of the current row in src1/src2 */
        xor     %rax, %rax              /* byte offset of the current element in dst */
naive_process:
        fldl    (%rsi, %rcx)            /* st0 = a */
        fldl    (%rdx, %rcx)            /* st0 = b, st1 = a */
        faddp   %st(0), %st(1)          /* st0 = a + b */
        fld     %st(0)                  /* duplicate the sum: st0 = st1 = a + b */
        faddp   %st(0), %st(1)          /* st0 = 2 * (a + b) */
        fstpl   (%rdi, %rax)            /* store the result and empty the x87 stack */
        add     $(8 * 1024), %rcx       /* advance one row: 1024 doubles */
        add     $8, %rax                /* advance one double in dst */
        sub     $1, %r8
        jne     naive_process
        sub     $1, %r9
        jne     native_loop
        ret
/*
 * void opt_calc(void *dst, const void *src1, const void *src2)
 *
 * Same computation as naive_calc: for each of 512 rows, stores
 * 2 * (src1[row][0] + src2[row][0]) to dst[row], repeated 100 times.
 * In addition, each iteration prefetches the element that the next
 * iteration will read from both source arrays (one row, 8 KB, ahead).
 *
 * In:       %rdi = dst, %rsi = src1, %rdx = src2
 * Clobbers: %rax, %rcx, %r8, %r9, flags, x87 stack (left empty on return)
 *
 * The doubling used to be done with "fcmove %st(1), %st(0)", which only copies
 * when ZF is set. ZF was set for the first row (left over from the xor) but
 * clear for every later row, so rows 1..511 received src1 + 2 * src2 instead.
 * The sum is now duplicated unconditionally with "fld %st(0)".
 */
_opt_calc:
        mov     $100, %r9               /* number of full passes */
opt_loop:
        mov     $512, %r8               /* rows left in this pass */
        xor     %rcx, %rcx              /* byte offset of the current row in src1/src2 */
        xor     %rax, %rax              /* byte offset of the current element in dst */
opt_process:
        /* On the last row these addresses lie one row past the data read by
           this routine; prefetch is only a hint and does not fault. */
        prefetcht0 (8 * 1024)(%rsi, %rcx)
        prefetcht0 (8 * 1024)(%rdx, %rcx)
        fldl    (%rsi, %rcx)            /* st0 = a */
        fldl    (%rdx, %rcx)            /* st0 = b, st1 = a */
        faddp   %st(0), %st(1)          /* st0 = a + b */
        fld     %st(0)                  /* duplicate the sum: st0 = st1 = a + b */
        faddp   %st(0), %st(1)          /* st0 = 2 * (a + b) */
        fstpl   (%rdi, %rax)            /* store the result and empty the x87 stack */
        add     $(8 * 1024), %rcx       /* advance one row: 1024 doubles */
        add     $8, %rax                /* advance one double in dst */
        sub     $1, %r8
        jne     opt_process
        sub     $1, %r9
        jne     opt_loop
        ret
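The only difference between opt_calc and naive_calc is that opt_calc issues two prefetch instructions (prefetcht0) at the start of each iteration, one for each source array, so that the data needed by the next iteration is requested ahead of time.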
Below is the test function, written in C:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns the current CPU time-stamp counter value (implemented in assembly). */
extern unsigned long get_cycles(void);

/* Calculate 2 * (src1 + src2) */
extern void naive_calc(void *dst, const void *src1, const void *src2);
extern void opt_calc(void *dst, const void *src1, const void *src2);

extern void hey(int y);

/*
 * Benchmark driver: fills two 512 x 1024 arrays of doubles, runs naive_calc
 * and then opt_calc on them, and prints the elapsed cycle count of each call
 * together with the first result element.
 *
 * y is not used.
 */
void hey(int y)
{
    enum { ROW_COUNT = 512, COLUMN_COUNT = 1024 };
    const size_t buffer_bytes = sizeof(double) * COLUMN_COUNT * ROW_COUNT;

    (void)y;

    // Initialize the buffers
    void *src1 = malloc(buffer_bytes);
    void *src2 = malloc(buffer_bytes);
    double dst[ROW_COUNT];

    /* Without this check a failed allocation would be passed to memset. */
    if (src1 == NULL || src2 == NULL) {
        fprintf(stderr, "hey: failed to allocate the source buffers\n");
        free(src1);
        free(src2);
        return;
    }

    memset(src1, 0, buffer_bytes);
    memset(src2, 0, buffer_bytes);

    double *p = (double *)src1;
    *p = 3.141;
    p = (double *)src2;
    *p = 2.163;

    // naive
    unsigned long t1 = get_cycles();
    naive_calc(dst, src1, src2);    // rdi, rsi, rdx
    unsigned long t2 = get_cycles();

    // opt
    unsigned long t3 = get_cycles();
    opt_calc(dst, src1, src2);
    unsigned long t4 = get_cycles();

    printf("Naive cycles is: %lu\n", t2 - t1);
    printf("Opt cycles: %lu\n", t4 - t3);
    printf("The value is: %f\n", dst[0]);

    free(src1);
    free(src2);
}
To give each iteration more arithmetic work, the sum is additionally multiplied by 2, so each stored result is intended to be 2 * (src1 + src2).
On a Mac mini with an Intel P7350 CPU (a dual-core processor based on the Intel Core architecture and built on 45 nm technology; the clock speed in GHz is missing from the original text), opt_calc runs about 8% to 20% faster than naive_calc.