#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>  // MSVC-specific intrinsics header used by the original build
#endif
#include <immintrin.h>  // AVX intrinsics (_mm256_*), available on MSVC, GCC and Clang

#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <new>
#include <vector>

// GCC and Clang refuse to inline AVX intrinsics into a function that is not
// compiled for AVX. This attribute enables AVX for one function only, so the
// file builds without passing -mavx. Other compilers get an empty macro.
#if defined(__GNUC__) || defined(__clang__)
#define AVX_FUNCTION __attribute__((target("avx")))
#else
#define AVX_FUNCTION
#endif

namespace {

// Number of doubles in each benchmark array.
const std::size_t kElementCount = 100000000;

// Number of full passes each benchmark makes over the arrays.
const int kPassCount = 4;

// Returns the whole milliseconds elapsed since `start` on the monotonic clock.
long long elapsed_ms(std::chrono::steady_clock::time_point start) {
    const std::chrono::steady_clock::duration elapsed =
        std::chrono::steady_clock::now() - start;
    return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
}

}  // namespace

/// Scalar baseline: computes re[i] = a[i] + b[i] for kElementCount elements,
/// repeated kPassCount times, then prints the elapsed milliseconds on its own
/// line.
///
/// @param a   first input array, at least kElementCount doubles
/// @param b   second input array, at least kElementCount doubles
/// @param re  output array, at least kElementCount doubles
void Test1(const double* a, const double* b, double* re) {
    const auto start = std::chrono::steady_clock::now();

    for (int pass = 0; pass < kPassCount; ++pass) {
        for (std::size_t i = 0; i < kElementCount; ++i) {
            re[i] = a[i] + b[i];
        }
    }

    // endl flushes so the timing is visible before the next benchmark starts.
    std::cout << elapsed_ms(start) << std::endl;
}

/// AVX version of Test1: adds four doubles per instruction using 256-bit
/// registers, repeated kPassCount times, then prints the elapsed milliseconds
/// on its own line. Produces the same contents in `re` as Test1.
///
/// The CPU must support AVX at run time; no run-time feature check is done.
///
/// @param a   first input array, at least kElementCount doubles
/// @param b   second input array, at least kElementCount doubles
/// @param re  output array, at least kElementCount doubles
AVX_FUNCTION void Test2(const double* a, const double* b, double* re) {
    const std::size_t kLanes = 4;  // doubles held by one __m256d
    // Last index (exclusive) that can be processed in whole 4-wide steps.
    const std::size_t vector_end = kElementCount - kElementCount % kLanes;

    const auto start = std::chrono::steady_clock::now();

    for (int pass = 0; pass < kPassCount; ++pass) {
        std::size_t i = 0;

        // Unaligned load/store: the arrays are not guaranteed to be 32-byte
        // aligned. Loading straight from memory keeps the lanes in array
        // order, so no per-lane shuffling or extraction is needed.
        for (; i < vector_end; i += kLanes) {
            const __m256d lhs = _mm256_loadu_pd(a + i);
            const __m256d rhs = _mm256_loadu_pd(b + i);
            _mm256_storeu_pd(re + i, _mm256_add_pd(lhs, rhs));
        }

        // Scalar tail for element counts that are not a multiple of 4.
        for (; i < kElementCount; ++i) {
            re[i] = a[i] + b[i];
        }
    }

    // endl flushes so the timing is visible immediately.
    std::cout << elapsed_ms(start) << std::endl;
}

/// Allocates three arrays of kElementCount doubles, fills the two inputs with
/// their own index values, and runs the scalar and AVX benchmarks in turn.
///
/// @return 0 on success, 1 if the arrays could not be allocated.
int main(int /*argc*/, char* /*argv*/[]) {
    std::vector<double> a;
    std::vector<double> b;
    std::vector<double> re;

    // Three arrays of 100M doubles need about 2.4 GB; fail with a message
    // instead of an uncaught exception when that much memory is unavailable.
    // resize() value-initializes, so every array is written once before any
    // timing starts.
    try {
        a.resize(kElementCount);
        b.resize(kElementCount);
        re.resize(kElementCount);
    } catch (const std::bad_alloc&) {
        std::cerr << "Failed to allocate benchmark arrays\n";
        return 1;
    }

    for (std::size_t i = 0; i < kElementCount; ++i) {
        const double value = static_cast<double>(i);
        a[i] = value;
        b[i] = value;
    }

    Test1(a.data(), b.data(), re.data());
    Test2(a.data(), b.data(), re.data());

#ifdef _WIN32
    // Keep the console window open when launched from an IDE on Windows.
    std::system("pause");
#endif
    return 0;
}
In this test, the AVX version (Test2) can be about 100 milliseconds faster than the plain scalar loop (Test1).
Multimedia instructions (AVX-accelerated element-wise array addition)