I have been writing programs for the DM642 platform, and most of my work is related to optimization. To understand the optimization rules of the TI CCS compiler more clearly, I ran the following experiment:
I wrote several functions (eight are timed below) that perform the same operation and produce the same result but are implemented in different ways, so their run times naturally differ:
----------------------Debug Mode----------------------------------
Combine1 (V, &dest) time elapse:0.028967 Ms.
Combine2 (V, &dest) time elapse:0.024751 Ms.
Combine3 (V, &dest) time elapse:0.023541 Ms.
Combine4 (V, &dest) time elapse:0.012635 Ms.
combine4p (V, &dest) time elapse:0.012090 Ms.
combine5p (V, &dest) time elapse:0.007475 Ms.
Combine6 (V, &dest) time elapse:0.011119 Ms.
combine6p (V, &dest) time elapse:0.006823 Ms.
Combine1 > Combine2 > Combine3 > Combine4 > combine4p > Combine6 > combine5p > combine6p
Because nothing is optimized in debug mode, the result is as expected. From Combine1 to combine6p, the number of function calls and memory accesses is reduced step by step, the loops are unrolled, and so on, so the functions naturally take less and less time. However, when I enabled -o3 optimization, I was puzzled by the result:
------------------Release Mode---------------------------------------
Combine1 (V, &dest) time elapse:0.011491 Ms.
Combine2 (V, &dest) time elapse:0.009616 Ms.
Combine3 (V, &dest) time elapse:0.009603 Ms.
Combine4 (V, &dest) time elapse:0.003884 Ms.
combine4p (V, &dest) time elapse:0.004096 Ms.
combine5p (V, &dest) time elapse:0.005573 Ms.
Combine6 (V, &dest) time elapse:0.005120 Ms.
combine6p (V, &dest) time elapse:0.004987 Ms.
The most notable changes are in combine4 and combine4p: their run times drop the most, and they become the fastest functions.
Take a closer look at the core loop in combine4:
for (i = 0; i < len; i++)
{
x = x OPER data[i];
}
There is no manual loop unrolling here. Precisely because of this, the compiler optimizes it even better than the functions whose loops were unrolled by hand.
This suddenly makes it the fastest function. The only difference between combine4 and combine4p is that one uses array indexing and the other uses pointers.
This also shows that the compiler optimizes array indexing better than pointer access, although the difference is not large.
Open profile View:
0:0x800202c0-0x80020374,combine4, 125-143:test.c, function, 1,1580,1580,1430,1430,7,7,7,7,
0:0x800207b4-0x80020980,combine6, 217-239:test.c, function, 1,2117,2117,2021,2021,12,12,9,9,
0:0x80020344-0x80020354,combine4, 138-141:test.c, Loop, 124, 1102,1102,978,978,5,5,4,4, (Loop 124 times, Cache hit 5/(5+4) = 0.56)
0:0x80020880-0x80020898,combine6, 227-231:test.c, Loop, three, 1041,1041,981,981,6,6,4,4, (60 cycles, Cache hit 4/10 = 0.4)
It seems that the compiler does loop unrolling better than we do by hand.
In addition, combine6p unrolls its loop heavily but does not get the expected proportional speed-up, because beyond a certain degree of unrolling the memory-access wait time becomes the bottleneck.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <csl.h>
5 #include <csl_cache.h>
6 #include <time.h>
7 #include "myMath.h"
8 #include "hky_testtime.h"
9
/*
 * IDENT is the identity element and OPER the binary operator of the
 * reduction performed by the combine* functions; with 0 and + the
 * functions compute a sum.
 */
#define IDENT 0
#define OPER +
/* Not referenced in this part of the file; presumably the vector length
 * used by the benchmark driver -- TODO confirm. */
#define Vec_len 1024

/* Element type stored in a vector. */
typedef int data_t;

/*
 * Vector header: an element count and a pointer to the element array.
 * new_vec() sets data to NULL when the requested length is not positive.
 */
typedef struct {
    int len;
    data_t *data;
} vec_rec, *vec_ptr;

vec_ptr new_vec(int len);
int get_vec_element(vec_ptr v, int index, data_t *dest);
int vec_length(vec_ptr v);

void combine1(vec_ptr v, data_t *dest);
26
Vec_ptr New_vec (intLen
28 {
Vec_ptr result = (vec_ptr) malloc (sizeof(Vec_rec));
30if(!result)
31 {
32returnNULL;
33}
34
Result->len = Len;
36
37if(Len > 0)
38 {
39intI
data_t *data = (data_t *) malloc (len *sizeof(data_t));
41if(!data)
42 {
(void*) result);
44returnNULL;
45}
Result->data = data;
47
48 for(i = 0; i < len; i++)
49 {
Data[i] = i;
51}
52}
53Else
54 {
Result->data = NULL;
56}
57returnResult
58}
59
60intGet_vec_element (Vec_ptr V,intIndex, data_t *dest)
61 {
62if(Index < 0 | | index >= v->len)
63 {
64return0;
65}
*dest = v->data[index];
67return1;
68}
69
70intVec_length (Vec_ptr v)
71 {
72returnv->len;
73}
74
75voidCombine1 (Vec_ptr V, data_t *dest)
76 {
77intI
78
*dest = IDENT;
80
81 for(i = 0; i < vec_length (v); i++)
82 {
data_t Val;
Get_vec_element (V, I, &val);
*dest = *dest OPER val;
86}
87}
88
89voidCombine2 (Vec_ptr V, data_t *dest)
90 {
91intI
92intLen
93
94 *dest = IDENT;
Len = Vec_length (v);
96
97 for(i = 0; i < len; i++)
98 {
data_t Val;
Get_vec_element (V, I, &val);
101 *dest = *dest OPER val;
102}
103}
104
105voidCombine3 (Vec_ptr V, data_t *dest)
106 {
107intI
108intLen
109 data_t x;
110
111 *dest = IDENT;
x = IDENT;
113
Len = Vec_length (v);
115
116 for(i = 0; i < len; i++)
117 {
118 data_t Val;
119 Get_vec_element (V, I, &val);
x = x OPER val;
121}
122 *dest = x;
123}
124
125voidCombine4 (Vec_ptr V, data_t *dest)
126 {
127intI
128intLen
129 data_t x;
data_t *data;
131
*dest = IDENT;
133 x = IDENT;
134
135 len = vec_length (v);
136 data = v->data;
137
138 for(i = 0; i < len; i++)
139 {
OPER x = x data[i];
141}
142 *dest = x;
143}
144
145voidcombine4p (Vec_ptr V, data_t *dest)
146 {
147intI
148intLen
149 data_t x;
data_t *data;
151 data_t *dend;
152
153 *dest = IDENT;
154 x = IDENT;
155
156 len = vec_length (v);
157 data = v->data;
158 dend = data + len;
159
160 for(; data < dend; data++)
161 {
162 x = x OPER (*data);
163}
164 *dest = x;
165}
166voidCombine5 (Vec_ptr V, data_t *dest)
167 {
168intI
169intLen
data_t x;
171 data_t *data;
172
173 *dest = IDENT;
174 x = IDENT;
175
176 len = Vec_length (v);
177 data = v->data;
178
179 for(i = 0; i < len; i + = 2)
180 {
181 x = x OPER data[i];
182 x = x OPER data[i+1];
183}
184 *dest = x;
185}
186voidcombine5p (Vec_ptr V, data_t *dest)
187 {
188intI
189intLen
data_t x;
191 data_t *data;
192 data_t *dend;
193 data_t *dlimit;
194
195 *dest = IDENT;
196 x = IDENT;
197
198 len = vec_length (v);
199 data = v->data;
DEnd = data + len;
201 Dlimit = dend-7;
202
203 for(; data < dlimit; Data + = 8)
204 {
205 x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3]
206 OPER data[4] OPER data[5] OPER data[6] OPER data[7];
207}
208
209 for(; data < dend; data++)
210 {
211 x = x OPER data[0];
212}
213
214 *dest = x;
215}
216
217voidCombine6 (Vec_ptr V, data_t *dest)
218 {
219intLength = Vec_length (v);
220intLimit = length-1;
221intI
222
223 data_t *data = v->data;
224 data_t x0 = IDENT;
225 data_t x1 = IDENT;
226
227 for(i = 0; i < limit; i + = 2)
228 {
229 x0 = x0 OPER data[i];
X1 = x1 OPER data[i+1];
231}
232
233 for(; i < length; i++)
234 {
235 x0 = x0 OPER data[i];
236}
237
238 *dest = x0 OPER x1;
239}
240
241voidcombine6p (Vec_ptr V, data_t *dest)
242 {
243intI
244intLen
245 data_t x;
246 data_t *data;
247 data_t *dend;
248 data_t *dlimit;
249
*dest = IDENT;
251 x = IDENT;
252
253 len = vec_length (v);
254 data = v->data;
255 DEnd = data + len;
Dlimit = dend-15;
257
258 for(; data < dlimit; Data + = 16)
259 {
260 x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3] OPER
261 data[4] OPER data[5] OPER data[6] OPER data[7] OPER
262 data[8] OPER data[9] OPER data[10] OPER data[11] OPER
263 data[12] OPER data[13] OPER data[14] OPER data[15];
264}
265
266 for(; data < dend; data++)
267 {
268 x = x OPER data[0</