Project Euler solution optimized using SSE2

Source: Internet
Author: User
Tags diff

http://www.mathblog.dk/project-euler-46-odd-number-prime-square/

Not a hard one to code, but it can be optimized using SSE2 instructions. The code below runs with g++ 4.8.1:

g++ -g -c riddle.cpp -std=c++11 -msse2 -pg
g++ -o riddle.exe riddle.o -pg
objdump -d -M intel -s riddle.o > Assembly.txt
riddle
gprof riddle.exe gmon.out > Report.txt

And here is the code:

// Project Euler 46: find the smallest odd composite that cannot be written as
// prime + 2 * k^2.  When the compiler targets SSE2, four candidate primes are
// tested per step; otherwise a scalar fallback is used.
#if defined(__SSE2__)
#include <xmmintrin.h>  // SSE
#include <emmintrin.h>  // SSE2
#endif
#include <chrono>
#include <cstdio>
#include <ctime>
#include <iostream>
using namespace std;

typedef unsigned long UL;

// NOTE(review): the value of MAX_PRIME_CNT was lost in this copy of the
// listing.  1229 is the number of primes below 10000, which matches the range
// covered by MAX_CNT - confirm against the original post.
#define MAX_PRIME_CNT 1229
#define MAX_CNT 10001

// Ascending prime table.  The pre-loaded initializer in this copy of the
// listing is cut off after 4241, so the table is filled at start-up by
// load_primes() instead.
int primes[MAX_PRIME_CNT];

// squarem[n] is true when n is a perfect square (n < MAX_CNT).
bool squarem[MAX_CNT] = {false};

// For an odd composite n: index in primes[] of the largest prime below n.
// For a prime n: -2 (never -1, so primes cannot end the search).
int startprimeinx[MAX_CNT];

#if defined(__SSE2__)
// Debug only: prints the four 32-bit lanes of v separated by ':'.
void printm128i(const __m128i &v) {
    unsigned lanes[4];
    _mm_storeu_si128((__m128i *)lanes, v);  // avoids aliasing the vector through unsigned*
    cout << lanes[0] << ":" << lanes[1] << ":" << lanes[2] << ":" << lanes[3] << endl;
}

// Squares a and b in one instruction.  _mm_mul_epu32 multiplies the 32-bit
// values in lanes 0 and 2 and yields two 64-bit products, so when the result
// is viewed as four 32-bit lanes, lane 0 holds a*a and lane 2 holds b*b
// (low 32 bits).
inline __m128i pwr2_sse(const int &a, const int &b) {
    const __m128i mv = _mm_set_epi32(0, b, 0, a);
    return _mm_mul_epu32(mv, mv);
}

// Computes (a[i] - b[i]) >> 1 for i in [0..3].  Despite the "shl" in the
// name, the shift is a logical right shift by one.
inline __m128i sub4_and_shl1_sse(int a[4], int *b) {
    const __m128i va = _mm_loadu_si128((__m128i *)a);
    const __m128i vp = _mm_loadu_si128((__m128i *)b);
    return _mm_srli_epi32(_mm_sub_epi32(va, vp), 1);
}
#endif

// Fills primes[] in ascending order with the primes below MAX_CNT (at most
// MAX_PRIME_CNT of them) using a sieve of Eratosthenes.  Returns the number
// of entries written.
static int load_primes() {
    static bool composite[MAX_CNT];  // zero-initialised
    int cnt = 0;
    for (int n = 2; n < MAX_CNT && cnt < MAX_PRIME_CNT; ++n) {
        if (composite[n]) continue;
        primes[cnt++] = n;
        for (int m = n * n; m < MAX_CNT; m += n) composite[m] = true;
    }
    return cnt;
}

// Marks every perfect square below MAX_CNT in squarem[].
static void mark_squares() {
    int i = 0;
#if defined(__SSE2__)
    // Four squares per iteration while the largest one still fits the table.
    for (; (i + 3) * (i + 3) < MAX_CNT; i += 4) {
        unsigned lanes[4];
        _mm_storeu_si128((__m128i *)lanes, pwr2_sse(i, i + 1));
        squarem[lanes[0]] = squarem[lanes[2]] = true;
        _mm_storeu_si128((__m128i *)lanes, pwr2_sse(i + 2, i + 3));
        squarem[lanes[0]] = squarem[lanes[2]] = true;
    }
#endif
    // Scalar tail (and the whole job when SSE2 is unavailable).
    for (; i * i < MAX_CNT; ++i) squarem[i * i] = true;
}

// Pre-calculates startprimeinx[] from the first prime_cnt entries of primes[].
static void build_start_index(int prime_cnt) {
    if (prime_cnt <= 0) return;
    for (int i = 1; i < prime_cnt; ++i) {
        const int prevprime = primes[i - 1];
        const int currprime = primes[i];
        startprimeinx[prevprime] = -2;
        for (int j = prevprime + 2; j < currprime; j += 2)  // skip all evens
            startprimeinx[j] = i - 1;
    }
    // The last prime and the odd numbers above it are not reached by the loop
    // above; fill them in so that every odd index below MAX_CNT is valid.
    const int lastprime = primes[prime_cnt - 1];
    startprimeinx[lastprime] = -2;
    for (int j = lastprime + 2; j < MAX_CNT; j += 2)
        startprimeinx[j] = prime_cnt - 1;
}

int main() {
    auto start = std::chrono::high_resolution_clock::now();

    const int prime_cnt = load_primes();
    mark_squares();
    build_start_index(prime_cnt);

    // Main logic: walk the odd numbers.  For each one, scan the primes below
    // it from largest to smallest, looking for one where (v - prime) / 2 is a
    // perfect square.  offset reaches -1 only when every prime was tried
    // without success.
    int v = 1;
    int offset = -2;
    bool found = false;
    while ((v += 2) < MAX_CNT) {  // bounded so the tables are never overrun
        offset = startprimeinx[v];
        while (offset >= 0) {
#if defined(__SSE2__)
            // With more than 4 primes left, test primes[offset-3..offset]
            // together.
            if (offset > 4) {
                int vv[4] = {v, v, v, v};
                unsigned half[4];
                _mm_storeu_si128((__m128i *)half,
                                 sub4_and_shl1_sse(vv, primes + offset - 3));
                if (squarem[half[3]] || squarem[half[2]] ||
                    squarem[half[1]] || squarem[half[0]])
                    break;
                offset -= 4;
                continue;
            }
#endif
            if (squarem[(v - primes[offset]) >> 1]) break;
            offset--;
        }
        if (offset == -1) {
            found = true;
            break;
        }
    }

    if (!found) {
        fprintf(stderr, "no counterexample below %d\n", MAX_CNT);
        return 1;
    }
    printf("%lu\n", (UL)v);

    // Output the elapsed time in seconds.
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    cout << "Time (in second):" << diff.count() << endl;
    return 0;
}

Contact Us

The content of this page is sourced from the Internet and does not represent Alibaba Cloud's opinion; products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.