CPU and code optimization (1): Replacing if-else with the ternary operator to reduce the CPU branch-prediction penalty; up to a 13-times function speed-up (Unity).

Last Update:2018-07-25 Source: Internet

Author: User

Tags min advantage intel core i5

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on the Alibaba Cloud. Read more ＞

Test Object

1. Unity script (C#)
2. C# DLL (a dynamic link library built with mcs and imported into Unity)
3. C native code (compiled with LLVM and then imported into Unity)

Source code of the tested functions

(C#): Two arrays of random numbers are compared element by element; after the call, one array holds the smaller number of each pair and the other holds the larger one.

    /// <summary>
    /// If-else variant: for each of the first <paramref name="n"/> index positions,
    /// swaps the pair when <c>a[i] &gt; b[i]</c>, so that <paramref name="a"/> ends up
    /// with the smaller value and <paramref name="b"/> with the larger one.
    /// </summary>
    /// <param name="a">Array that receives the smaller value of each pair (modified in place).</param>
    /// <param name="b">Array that receives the larger value of each pair (modified in place).</param>
    /// <param name="n">Number of leading elements to process; the caller must ensure it does not exceed either array's length.</param>
    public void Minmax1_csharp (double[] a, double[] b, int n) {
        for (int i = 0; i < n; i++) {
            // The data-dependent branch below is what the article measures:
            // with random input its outcome cannot be predicted.
            if (a [i] > b [i]) {
                double t = a [i];
                a [i] = b [i];
                b [i] = t;
            }
        }
    }

    /// <summary>
    /// Ternary-operator variant: for each of the first <paramref name="n"/> index
    /// positions, computes the smaller and the larger value of the pair with two
    /// conditional expressions and stores them unconditionally into
    /// <paramref name="a"/> and <paramref name="b"/> respectively.
    /// </summary>
    /// <param name="a">Array that receives the smaller value of each pair (modified in place).</param>
    /// <param name="b">Array that receives the larger value of each pair (modified in place).</param>
    /// <param name="n">Number of leading elements to process; the caller must ensure it does not exceed either array's length.</param>
    public void Minmax2_csharp (double[] a, double[] b, int n) {
        for (int i = 0; i < n; i++) {
            // Both values are read before either store, so the two writes
            // below cannot clobber an operand that is still needed.
            double min = a [i] < b [i] ? a [i] : b [i];
            double max = a [i] < b [i] ? b [i] : a [i];
            a [i] = min;
            b [i] = max;
        }
    }

/*
 * If-else variant exported from the native plugin.
 *
 * For each of the first n index positions, swaps a[i] and b[i] when
 * a[i] > b[i], so that a ends up with the smaller value of each pair and
 * b with the larger one. Both arrays are modified in place; the caller
 * must ensure each holds at least n elements.
 */
extern void Minmax1 (double a[], double b[], int n) {
    int i;
    for (i = 0; i < n; i++) {
        /* Data-dependent branch: unpredictable for random input. */
        if (a [i] > b [i]) {
            double t = a [i];
            a [i] = b [i];
            b [i] = t;
        }
    }
}

/*
 * Ternary-operator variant exported from the native plugin.
 *
 * For each of the first n index positions, computes the smaller and the
 * larger value of the pair with two conditional expressions and stores
 * them unconditionally into a[i] and b[i]. Both arrays are modified in
 * place; the caller must ensure each holds at least n elements.
 */
extern void Minmax2 (double a[], double b[], int n) {
    int i;
    for (i = 0; i < n; i++) {
        /* Read both operands before storing so neither write clobbers
         * a value that is still needed. */
        double min = a [i] < b [i] ? a [i] : b [i];
        double max = a [i] < b [i] ? b [i] : a [i];
        a [i] = min;
        b [i] = max;
    }
}

test Script:

using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using UnityEngine;

/// <summary>
/// Unity benchmark script. On Start it times six implementations of the same
/// pairwise min/max routine (native C plugin, in-script C#, external C# DLL;
/// each as an if-else and a ternary-operator variant) and logs the elapsed
/// milliseconds of each run.
/// </summary>
public class Conditionaltrans : MonoBehaviour {
    // Number of element pairs handed to every timed call.
    private const int ElementCount = 1000000;

    private double[] array1_double = new double[ElementCount];
    private double[] array2_double = new double[ElementCount];

    // Runs every variant once on freshly randomized data. Only the call itself
    // sits between the two TickCount reads; refilling the arrays is not timed.
    void Start () {
        int pTime;
        int cTime;

        // Native C plugin, if-else variant.
        FillRandom ();
        pTime = System.Environment.TickCount;
        Minmax1 (array1_double, array2_double, ElementCount);
        cTime = System.Environment.TickCount;
        Debug.Log ("Mm1:time:" + (cTime - pTime));

        // Native C plugin, ternary variant.
        FillRandom ();
        pTime = System.Environment.TickCount;
        Minmax2 (array1_double, array2_double, ElementCount);
        cTime = System.Environment.TickCount;
        Debug.Log ("Mm2:time:" + (cTime - pTime));

        // In-script C#, if-else variant.
        FillRandom ();
        pTime = System.Environment.TickCount;
        Minmax1_csharp (array1_double, array2_double, ElementCount);
        cTime = System.Environment.TickCount;
        Debug.Log ("Mm1_cs:time:" + (cTime - pTime));

        // In-script C#, ternary variant.
        FillRandom ();
        pTime = System.Environment.TickCount;
        Minmax2_csharp (array1_double, array2_double, ElementCount);
        cTime = System.Environment.TickCount;
        Debug.Log ("Mm2_cs:time:" + (cTime - pTime));

        // NOTE(review): the type and method names of the external C# DLL are
        // taken from the article text, whose capitalization is unreliable;
        // confirm them against the actual assembly.
        Libtest6.myclass mc = new Libtest6.myclass ();

        // External C# DLL, if-else variant.
        FillRandom ();
        pTime = System.Environment.TickCount;
        mc.Csdll_minmax1 (array1_double, array2_double, ElementCount);
        cTime = System.Environment.TickCount;
        Debug.Log ("Mm1_csdll:time:" + (cTime - pTime));

        // External C# DLL, ternary variant.
        FillRandom ();
        pTime = System.Environment.TickCount;
        mc.Csdll_minmax2 (array1_double, array2_double, ElementCount);
        cTime = System.Environment.TickCount;
        Debug.Log ("Mm2_csdll:time:" + (cTime - pTime));
    }

    // Refills both arrays with new random values so that every timed run
    // starts from unsorted data (each run reorders the pairs in place).
    private void FillRandom () {
        for (int i = 0; i < ElementCount; i++) {
            array1_double [i] = (double) Random.Range (1f, 100000f);
            array2_double [i] = (double) Random.Range (1f, 100000f);
        }
    }

    /// <summary>
    /// If-else variant: swaps a[i] and b[i] whenever a[i] &gt; b[i], for the
    /// first <paramref name="n"/> elements, modifying both arrays in place.
    /// </summary>
    public void Minmax1_csharp (double[] a, double[] b, int n) {
        int i;
        for (i = 0; i < n; i++) {
            if (a [i] > b [i]) {
                double t = a [i];
                a [i] = b [i];
                b [i] = t;
            }
        }
    }

    /// <summary>
    /// Ternary variant: stores the smaller value of each pair into a[i] and the
    /// larger into b[i], for the first <paramref name="n"/> elements.
    /// </summary>
    public void Minmax2_csharp (double[] a, double[] b, int n) {
        int i;
        for (i = 0; i < n; i++) {
            double min = a [i] < b [i] ? a [i] : b [i];
            double max = a [i] < b [i] ? b [i] : a [i];
            a [i] = min;
            b [i] = max;
        }
    }

    // Entry points exported by the native plugin.
    [DllImport ("C_plugin", ExactSpelling = true, EntryPoint = "Minmax1")]
    private static extern void Minmax1 (double[] a, double[] b, int n);

    [DllImport ("C_plugin")]
    private static extern void Minmax2 (double[] a, double[] b, int n);
}

test Environment:

CPU: 2.6 GHz Intel Core i5
Memory: 8 GB DDR3
IDE: Xcode version 9.1 (9B55)
MonoDevelop 5.9.6
Unity: 2017.1.1f1 Personal

Compiler and optimization level:

1. Unity script: mcs / .NET JIT, optimization level unknown
2. C# DLL: mcs / .NET JIT, optimization level unknown
3. C native: LLVM, -O3

Test results:

Two groups of 1 million random doubles, compared pairwise and swapped where necessary:
1. Unity script: 21 ms (Minmax1), 26 ms (Minmax2)
2. C# DLL: 15 ms (Minmax1), 24 ms (Minmax2)
3. C native: 5 ms (Minmax1), 2 ms (Minmax2)
Screenshot of test Result:
Analyze the cil/machine code generated by each version of the function:

First understand the C # compilation process
C# script: the source is first compiled by mcs into CIL (Common Intermediate Language, formerly Microsoft Intermediate Language) and saved in Assembly-CSharp.dll. Inside Unity, the .NET JIT compiler compiles the CIL into machine code at run time, as needed, and the machine then executes it.
C# DLL: the content of a C# DLL is still CIL; as above, it is compiled into machine code at run time by the JIT.
Because the JIT compiles at run time, I do not know how to obtain the machine code it generates on the Mac. Mono also has an AOT compiler that can precompile CIL, so the analysis below examines the AOT-generated machine code, with the optimization level set to --optimize=all. In practice, because the two compilers work differently and therefore apply different optimization strategies, the machine code they generate is unlikely to be identical; one can only hope that the difference is small.

Analysis, from slowest to fastest:

1. Unity script Minmax2

Same as the C# DLL version (both are compiled by MonoDevelop).

2. Unity script Minmax1

Same as the C# DLL version.

3. C# DLL Minmax2

CIL code:

IL_0041 is the start of the for loop. The instructions whose mnemonics begin with "b" (shown after their IL_xxxx labels) are branch jumps. The for loop contains 4 branch jumps; it appears that the ternary operators were translated into 4 if-style branches.

Machine code:

The range marked by the red line is the for loop. There are four jumps inside the loop, which faithfully implement the logic of the CIL version; the COMISD instruction compares the low 64 bits of the registers. No optimization has been applied.

4. C# DLL Minmax1

CIL code:

There is only one branch jump inside the loop, which starts at IL_0002; this matches the logic of the source code.

Machine code:

There is only one COMISD comparison and one branch jump inside the loop, which is fully consistent with the logic of the high-level source code.

5. C DLL Minmax1

Machine code:

There are two COMISD comparisons and jumps inside the loop (the source code has only one). I do not understand the reason for the two jumps and am not sure whether this is loop unrolling. (Both comparisons use JBE, and the register writes do not depend on which branch is taken; would that not make this an endless loop? Perhaps it is an error in the disassembler ...)

6. C DLL Minmax2

Machine code:

The ternary operators are translated directly into MINSD/MAXSD, so there is no branch jump inside the loop. Three XMM registers are used, apparently to reduce read/write dependencies and improve instruction-level parallelism, and the loop is unrolled twice.

Summary:

1. The number of (unpredictable) branch jumps at the machine-instruction level is inversely related to the speed of the function.

(Time / number of branch jumps inside the loop)
1. Unity script: 21 ms (Minmax1) / 1; 26 ms (Minmax2) / 4
2. C# DLL: 15 ms (Minmax1) / 1; 24 ms (Minmax2) / 4
3. C native: 5 ms (Minmax1) / 2; 2 ms (Minmax2) / 0

In a modern CPU, many instructions are executed at the same time by different hardware units, and even within one hardware unit, different pipeline stages work on different instructions simultaneously. In pursuit of speed, when the CPU encounters a branch instruction it does not wait for the result of the preceding comparison; instead it predicts the outcome (jump or no jump) and keeps issuing instructions. The instructions issued before the comparison result is known execute normally but do not write their results to memory. Once the comparison result is available, the results are committed if the prediction was correct; if the prediction was wrong, all of that work must be discarded and the CPU must go back to the branch and fetch instructions again. The time lost in this process is the branch misprediction penalty. This branch-prediction strategy is quite reasonable when the branch outcome is easy to predict (in another set of tests, where every element of array 1 is 1 larger than the corresponding element of array 2, all versions of the functions ran faster than C native Minmax2). However, when the outcome is simply unpredictable, as in this case, the CPU can only gamble, and the inevitable large number of misprediction penalties slows the program down.

2. C# DLLs are a little faster than C# scripts.

However, considering that (1) a single case cannot represent the general situation and (2) the advantage is very small, a more conservative judgment is that C# DLLs and Unity scripts each have pros and cons in different situations.

3. C native code compiled with LLVM -O3 has an absolute advantage in speed

Whether judged by the test data or by the theoretical soundness of the generated machine code, C is the clear winner.

4. At the source level, the ternary operator is not necessarily better than if-else

From a logical point of view, converting the ternary operator into MIN/MAX or CMOV CPU instructions should be entirely reasonable. However, because today's compilers have so much latitude, the machine code generated from the source is determined by the compiler's ability to analyze it. In this case, the intelligence of the mcs compiler is clearly limited (or this is a limitation of the CIL language): the ternary operator has already been translated into branch jumps at the CIL level, and how the JIT optimizes this case is unknown.
Xcode's LLVM sensibly converts the ternary operator into MIN/MAX instructions at the -O3 optimization level, but at other optimization levels it still makes the poorer choice of branch jumps.

————————————————————————————————————————————————————
Reference:
Computer Systems: A Programmer's Perspective - R. E. Bryant, D. R. O'Hallaron

Maintenance log:

This article is an English version of an article which is originally in the Chinese language on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More