I have been writing programs for the DM642 platform, and most of that work is related to optimization. To understand the optimization rules of the TI CCS compiler more clearly, I ran the following experiment:
I wrote several functions that perform the same operation but are implemented in different ways, so their running times naturally differ:
----------------------Debug Mode----------------------------------
Combine1 (V, &dest) time elapse:0.028967 Ms.
Combine2 (V, &dest) time elapse:0.024751 Ms.
Combine3 (V, &dest) time elapse:0.023541 Ms.
Combine4 (V, &dest) time elapse:0.012635 Ms.
combine4p (V, &dest) time elapse:0.012090 Ms.
combine5p (V, &dest) time elapse:0.007475 Ms.
Combine6 (V, &dest) time elapse:0.011119 Ms.
combine6p (V, &dest) time elapse:0.006823 Ms.
combine1 > combine2 > combine3 > combine4 > combine4p > combine6 > combine5p > combine6p
Because debug mode applies no optimization, this result matches what one would expect. Going from combine1 to combine6p, each version in turn reduces function calls and memory accesses, and then adds loop unrolling and so on, so the running time naturally gets shorter and shorter. However, when I turned on -o3 optimization, the result puzzled me:
------------------Release Mode---------------------------------------
Combine1 (V, &dest) time elapse:0.011491 Ms.
Combine2 (V, &dest) time elapse:0.009616 Ms.
Combine3 (V, &dest) time elapse:0.009603 Ms.
Combine4 (V, &dest) time elapse:0.003884 Ms.
combine4p (V, &dest) time elapse:0.004096 Ms.
combine5p (V, &dest) time elapse:0.005573 Ms.
Combine6 (V, &dest) time elapse:0.005120 Ms.
combine6p (V, &dest) time elapse:0.004987 Ms.
The most notable changes are in combine4 and combine4p: their running times drop the most, and they become the fastest functions.
Take a closer look at the core loop in combine4:
for (i = 0; i < len; i++)
{
x = x OPER data[i];
}
There is no manual loop unrolling here. And it is precisely because of this that the compiler can optimize it even further than the functions whose loops were unrolled by hand.
This makes it the fastest function. The only difference between combine4 and combine4p is that one uses array indexing and the other uses a pointer.
This also gives a bit of evidence that the compiler optimizes array indexing better than pointer traversal, although the difference is not large.
Open the profile view:
0:0x800202c0-0x80020374,combine4, 125-143:test.c, function, 1,1580,1580,1430,1430,7,7,7,7,
0:0x800207b4-0x80020980,combine6, 217-239:test.c, function, 1,2117,2117,2021,2021,12,12,9,9,
0:0x80020344-0x80020354,combine4, 138-141:test.c, Loop, 124, 1102,1102,978,978,5,5,4,4, (loop executed 124 times; cache hit rate 5/(5+4) = 0.56)
0:0x80020880-0x80020898,combine6, 227-231:test.c, loop, 1041,1041,981,981,6,6,4,4, (loop executed 60 times; cache hit rate 4/10 = 0.4)
It seems that the compiler unrolls loops better than we do by hand.
In addition, combine6p unrolls its loop heavily, but its speed does not improve geometrically as one might imagine, because beyond a certain degree of unrolling the memory-access wait time becomes the bottleneck.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <csl.h>
5 #include <csl_cache.h>
6 #include <time.h>
7 #include "myMath.h"
8 #include "hky_testtime.h"
9
/* Identity element and binary operator shared by every combine variant. */
#define IDENT 0
#define OPER +
/* Vector length constant; its use is outside the code shown here. */
#define Vec_len 1024

/* Element type stored in a vector. */
typedef int data_t;

/* Vector record: an element count and a pointer to the element storage. */
typedef struct {
    int len;
    data_t *data;
} vec_rec, *vec_ptr;

/* Allocates a vector of len elements; returns NULL on allocation failure. */
vec_ptr new_vec(int len);
/* Copies element index of v into *dest; returns 1 on success, 0 if out of range. */
int get_vec_element(vec_ptr v, int index, data_t *dest);
/* Returns the number of elements in v. */
int vec_length(vec_ptr v);

/* Folds all elements of v with OPER, starting from IDENT, into *dest. */
void combine1(vec_ptr v, data_t *dest);
26
Vec_ptr New_vec (intLen
28 {
Vec_ptr result = (vec_ptr) malloc (sizeof(Vec_rec));
30if(!result)
31 {
32 returnNULL;
33}
34
Result->len = Len;
36
37if(Len > 0)
38 {
39intI
data_t *data = (data_t *) malloc (len *sizeof(data_t));
41if(!data)
42 {
Free (void*) result);
44 returnNULL;
45}
Result->data = data;
47
48 for(i = 0; i < len; i++)
49 {
Data[i] = i;
51}
52}
53Else
54 {
Result->data = NULL;
56}
57 returnResult
58}
59
60intGet_vec_element (Vec_ptr V,intIndex, data_t *dest)
61 {
62if(Index < 0 | | | index >= v->len)
63 {
64 return0;
65}
*dest = v->data[index];
67 return1;
68}
69
70intVec_length (Vec_ptr v)
71 {
72 returnv->len;
73}
74
75voidCombine1 (Vec_ptr V, data_t *dest)
76 {
77intI
78
*dest = IDENT;
80
81 for(i = 0; i < vec_length (v); i++)
82 {
data_t Val;
Get_vec_element (V, I, &val);
*dest = *dest OPER val;
86}
87}
88
89voidCombine2 (Vec_ptr V, data_t *dest)
90 {
91intI
92intLen
93
*dest = IDENT;
Len = Vec_length (v);
96
97 for(i = 0; i < len; i++)
98 {
data_t Val;
Get_vec_element (V, I, &val);
*dest = *dest OPER val;
102}
103}
104
105voidCombine3 (Vec_ptr V, data_t *dest)
106 {
107intI
108intLen
109 data_t x;
110
*dest = IDENT;
112 x = IDENT;
113
114 len = vec_length (v);
115
116 for(i = 0; i < len; i++)
117 {
118 data_t Val;
119 Get_vec_element (V, I, &val);
x = x OPER val;
121}
122 *dest = x;
123}
124
125voidCombine4 (Vec_ptr V, data_t *dest)
126 {
127intI
128intLen
129 data_t x;
130 data_t *data;
131
132 *dest = IDENT;
The IDENT x = the;
134
135 len = vec_length (v);
136 data = v->data;
137
138 for(i = 0; i < len; i++)
139 {
140 x = x OPER data[i];
141}
*dest = x;
143}
144
145voidcombine4p (Vec_ptr V, data_t *dest)
146 {
147intI
148intLen
149 data_t x;
data_t *data;
151 data_t *dend;
152
153 *dest = IDENT;
154 x = IDENT;
155
156 len = vec_length (v);
157 data = v->data;
158 dend = data + len;
159
160 for(; data < dend; data++)
161 {
162 x = x OPER (*data);
163}
164 *dest = x;
165}
166voidCombine5 (Vec_ptr V, data_t *dest)
167 {
168intI
169intLen
170 data_t x;
171 data_t *data;
172
173 *dest = IDENT;
174 x = IDENT;
175
176 len = Vec_length (v);
177 data = v->data;
178
179 for(i = 0; i < len; i + = 2)
180 {
181 x = x OPER data[i];
OPER x = x data[i+1];
183}
184 *dest = x;
185}
186voidcombine5p (Vec_ptr V, data_t *dest)
187 {
188intI
189intLen
190 data_t x;
191 data_t *data;
*dend data_t;
193 data_t *dlimit;
194
195 *dest = IDENT;
196 x = IDENT;
197
198 len = vec_length (v);
199 data = v->data;
DEnd = data + len;
201 Dlimit = dend-7;
202
203 for(; data < dlimit; data + + 8)
204 {
205 x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3]
206 OPER data[4] OPER data[5] OPER data[6] OPER data[7];
207}
208
209 for(; data < dend; data++)
210 {
211 x = x OPER data[0];
212}
213
214 *dest = x;
215}
216
217voidCombine6 (Vec_ptr V, data_t *dest)
218 {
219intLength = Vec_length (v);
220intLimit = length-1;
221intI
222
223 data_t *data = v->data;
224 data_t x0 = IDENT;
data_t x1 = IDENT;
226
227 for(i = 0; i < limit i + + 2)
228 {
229 x0 = x0 OPER data[i];
230 x1 = x1 OPER data[i+1];
231}
232
233 for(; i < length; i++)
234 {
235 x0 = x0 OPER data[i];
236}
237
238 *dest = x0 OPER x1;
239}
240
241voidcombine6p (Vec_ptr V, data_t *dest)
242 {
243intI
244intLen
245 data_t x;
246 data_t *data;
247 data_t *dend;
248 data_t *dlimit;
249
*dest = IDENT;
251 x = IDENT;
252
253 len = vec_length (v);
254 data = v->data;
255 DEnd = data + len;
256 dlimit = dend-15;
257
258 for(; data < dlimit; data + + 16)
259 {
OPER x = x data[0] OPER data[1] OPER data[2] OPER data[3] OPER
261 data[4] OPER data[5] OPER data[6] OPER data[7] OPER
262 data[8] OPER data[9] OPER data[10] OPER data[11] OPER
263 data[12] OPER data[13] OPER data[14] OPER data[15];
264}
265
266 for(; data < dend; data++)
267 {
268 x = x OPER data[0</