Coverage Report

Created: 2021-04-05 01:08

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_micro_simplify.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_nnc_micro.h"
6
7
// Structural equality test for two loop index terms.
// When groups is non-NULL, tensor ids and axis-size ids are compared by their
// union-find group root (groups[] maps an id to its parent; a root maps to itself),
// so terms referring to merged/aliased tensors compare equal. When groups is NULL
// (e.g. from _ccv_nnc_same_shape), ids must match exactly.
// Returns 1 when the terms are structurally equal, 0 otherwise.
static int _ccv_nnc_same_index_term(const ccv_nnc_micro_loop_index_term_t a_index, const ccv_nnc_micro_loop_index_term_t b_index, const int* const groups)
{
	if (a_index.type != b_index.type)
		return 0;
	const int type = a_index.type;
	switch (type)
	{
		case CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL:
			// Constants compare by value.
			return a_index.immediate_value == b_index.immediate_value;
		case CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID:
			if (a_index.id.type != b_index.id.type)
				return 0;
			if (groups && (a_index.id.type == CCV_NNC_MICRO_AXIS_SIZE_ID || a_index.id.type == CCV_NNC_MICRO_TENSOR_ID))
			{
				// The dimension position must match even when comparing by group.
				if (a_index.id.d != b_index.id.d)
					return 0;
				switch (a_index.id.type)
				{
					case CCV_NNC_MICRO_TENSOR_ID:
					case CCV_NNC_MICRO_AXIS_SIZE_ID: {
						// Find their group identifier and then compare.
						int a_root = groups[a_index.id.id];
						while (groups[a_root] != a_root)
							a_root = groups[a_root];
						int b_root = groups[b_index.id.id];
						while (groups[b_root] != b_root)
							b_root = groups[b_root];
						return a_root == b_root;
					}
				}
				// Unreachable for the two id types above; kept as a conservative fallback.
				return a_index.id.id == b_index.id.id;
			} else
				// No group table (or a loop id): require exact id match.
				return (a_index.id.d == b_index.id.d && a_index.id.id == b_index.id.id);
		case CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY: {
			// Binary expressions compare operator and both operands recursively.
			return a_index.binary->op == b_index.binary->op && _ccv_nnc_same_index_term(a_index.binary->left, b_index.binary->left, groups) && _ccv_nnc_same_index_term(a_index.binary->right, b_index.binary->right, groups);
		}
	}
	return 0;
}
46
47
// Whether two shapes agree term-by-term across all dimensions.
// Index terms must match verbatim (no group table is consulted here).
static int _ccv_nnc_same_shape(const ccv_nnc_micro_loop_index_term_t* const a_shape, const ccv_nnc_micro_loop_index_term_t* const b_shape, const int dimensions)
{
	int d = 0;
	while (d < dimensions)
	{
		if (!_ccv_nnc_same_index_term(a_shape[d], b_shape[d], 0))
			return 0;
		++d;
	}
	return 1;
}
55
56
static int _ccv_nnc_same_loop(const ccv_nnc_micro_loop_block_t* const left_block, const ccv_nnc_micro_loop_block_t* const right_block, const int* const groups, int* const left_loop_idx, int* const right_loop_idx)
57
14
{
58
14
  assert(left_block->loop_count > 0);
59
14
  assert(right_block->loop_count > 0);
60
14
  int i, j;
61
14
  int left_right_link[left_block->loop_count];
62
14
  int right_left_link[right_block->loop_count];
63
14
  enum {
64
14
    ONE = -2,
65
14
    UNASSIGNED = -1,
66
14
  };
67
103
  for (i = 0; i < left_block->loop_count; 
i++89
)
68
89
    if (left_block->loops[i].start_index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL && left_block->loops[i].start_index.immediate_value == 0 &&
69
89
      left_block->loops[i].end_index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL && 
left_block->loops[i].end_index.immediate_value == 10
)
70
0
      left_right_link[i] = ONE;
71
89
    else
72
89
      left_right_link[i] = UNASSIGNED;
73
103
  for (i = 0; i < right_block->loop_count; 
i++89
)
74
89
    if (right_block->loops[i].start_index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL && right_block->loops[i].start_index.immediate_value == 0 &&
75
89
      right_block->loops[i].end_index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL && 
right_block->loops[i].end_index.immediate_value == 10
)
76
0
      right_left_link[i] = ONE;
77
89
    else
78
89
      right_left_link[i] = UNASSIGNED;
79
103
  for (i = 0; i < left_block->loop_count; 
i++89
) // Find corresponding loop on the right.
80
89
  {
81
89
    if (left_right_link[i] != UNASSIGNED)
82
0
      continue;
83
89
    int flag = UNASSIGNED;
84
469
    for (j = 0; j < right_block->loop_count && 
flag == UNASSIGNED434
;
j++380
)
85
380
    {
86
380
      if (right_left_link[j] != UNASSIGNED)
87
183
        continue;
88
197
      if (_ccv_nnc_same_index_term(left_block->loops[i].start_index, right_block->loops[j].start_index, groups) && 
89
197
        _ccv_nnc_same_index_term(left_block->loops[i].end_index, right_block->loops[j].end_index, groups))
90
63
        flag = j;
91
197
    }
92
89
    if (flag != UNASSIGNED)
93
63
    {
94
63
      left_right_link[i] = flag;
95
63
      right_left_link[flag] = i;
96
63
    }
97
89
  }
98
14
  // If still have unmatched, they don't share the same loop.
99
77
  for (i = 0; i < left_block->loop_count; 
i++63
)
100
68
    if (left_right_link[i] == UNASSIGNED)
101
5
      return 0;
102
72
  
for (i = 0; 9
i < right_block->loop_count;
i++63
)
103
63
    if (right_left_link[i] == UNASSIGNED)
104
0
      return 0;
105
9
  // I don't want to deal with constant loop, hence, if other than the outer-most is a constant loop (0..<1),
106
9
  // we cannot merge.
107
63
  
for (i = 1; 9
i < left_block->loop_count;
i++54
)
108
54
    if (left_right_link[i] == ONE)
109
0
      return 0;
110
63
  
for (i = 1; 9
i < right_block->loop_count;
i++54
)
111
54
    if (right_left_link[i] == ONE)
112
0
      return 0;
113
9
  assert((left_block->loop_count == right_block->loop_count) ||
114
9
      (left_block->loop_count == right_block->loop_count + 1) ||
115
9
      (left_block->loop_count + 1 == right_block->loop_count));
116
9
  // The loop matches, but the ordering probably doesn't. We reorder loop based on statements.
117
9
  // Hence, two loops can only merge if using the statements as a pivot point and they can still
118
9
  // match things before / after the statement.
119
9
  // If both have statements, check if order preserving within the statement loop (we can be fancier
120
9
  // and recursively call this while using statement as pivoting point, but that is too much to my taste).
121
9
  const int left_start_idx = left_right_link[0] == ONE ? 
10
: 0;
122
9
  const int right_start_idx = right_left_link[0] == ONE ? 
10
: 0;
123
72
  for (i = 0; i < left_block->loop_count; 
i++63
)
124
63
    left_loop_idx[i] = UNASSIGNED;
125
72
  for (i = 0; i < right_block->loop_count; 
i++63
)
126
63
    right_loop_idx[i] = UNASSIGNED;
127
9
  if (left_start_idx == 1)
128
0
    left_loop_idx[0] = 0; // Assign their index.
129
9
  if (right_start_idx == 0)
130
9
    right_loop_idx[0] = 0; // Assign their index.
131
9
  const int end_idx = left_right_link[0] == ONE && 
right_left_link[0] == ONE0
?
left_block->loop_count - 10
: ccv_min(left_block->loop_count, right_block->loop_count);
132
9
  int pivot_idx = end_idx;
133
9
  int k;
134
72
  for (i = end_idx - 1; i >= 0; 
i--63
)
135
63
  {
136
63
    if (left_block->loops[i + left_start_idx].statement_count > 0)
137
9
    {
138
9
      for (j = i + 1, k = i + 1; j < end_idx; 
j++0
)
139
0
        if (left_loop_idx[j + left_start_idx] == UNASSIGNED)
140
0
        {
141
0
          left_loop_idx[j + left_start_idx] = k + left_start_idx;
142
0
          // If the right one can be referenced pass previous pivot_idx, it is not right.
143
0
          if (left_right_link[j + left_start_idx] >= pivot_idx + right_start_idx)
144
0
            return 0;
145
0
          right_loop_idx[left_right_link[j + left_start_idx]] = k + right_start_idx;
146
0
          ++k;
147
0
          if (k > pivot_idx)
148
0
            return 0;
149
0
        }
150
9
      assert(k == pivot_idx);
151
9
      pivot_idx = i + 1;
152
9
    }
153
63
    if (right_block->loops[i + right_start_idx].statement_count > 0)
154
10
    {
155
13
      for (j = i + 1, k = i + 1; j < end_idx; 
j++3
)
156
3
        if (right_loop_idx[j + left_start_idx] == UNASSIGNED)
157
3
        {
158
3
          right_loop_idx[j + right_start_idx] = k + right_start_idx;
159
3
          // If the left one can be referenced pass previous pivot_idx, it is not right.
160
3
          if (right_left_link[j + right_start_idx] >= pivot_idx + left_start_idx)
161
0
            return 0;
162
3
          left_loop_idx[right_left_link[j + right_start_idx]] = k + left_start_idx;
163
3
          ++k;
164
3
          if (k > pivot_idx)
165
0
            return 0;
166
3
        }
167
10
      assert(k == pivot_idx);
168
10
      pivot_idx = i + 1;
169
10
    }
170
63
  }
171
9
  if (end_idx == 0)
172
0
    return 1;
173
9
  // Finally, to distribute the rest.
174
72
  
for (j = 0, k = 0; 9
j < end_idx;
j++63
)
175
63
  {
176
63
    if (left_loop_idx[j + left_start_idx] == UNASSIGNED)
177
60
    {
178
60
      left_loop_idx[j + left_start_idx] = k + left_start_idx;
179
60
      // If the right one can be referenced pass previous pivot_idx, it is not right.
180
60
      if (left_right_link[j + left_start_idx] >= pivot_idx + right_start_idx)
181
0
        return 0;
182
60
      right_loop_idx[left_right_link[j + left_start_idx]] = k + right_start_idx;
183
60
      ++k;
184
60
      if (k > pivot_idx)
185
0
        return 0;
186
60
    }
187
63
  }
188
9
  assert(k == pivot_idx);
189
9
  return 1;
190
9
}
191
192
// Apply the permutation computed by _ccv_nnc_same_loop to a block's loops,
// using loops[] as scratch space (caller-provided, loop_count entries).
static void _ccv_nnc_loop_order_by(ccv_nnc_micro_loop_block_t* const block, int* const loop_idx, ccv_nnc_micro_loop_t* const loops)
{
	const int loop_count = block->loop_count;
	int idx;
	// Scatter each loop into its target slot; a negative mapping means "stay put".
	for (idx = 0; idx < loop_count; idx++)
	{
		const int target = loop_idx[idx] >= 0 ? loop_idx[idx] : idx;
		loops[target] = block->loops[idx];
	}
	// Copy back only what defines the ordering: the loop id and its bounds.
	// Statements and loop-carried variables stay attached to their original slots.
	for (idx = 0; idx < loop_count; idx++)
	{
		block->loops[idx].id = loops[idx].id;
		block->loops[idx].start_index = loops[idx].start_index;
		block->loops[idx].end_index = loops[idx].end_index;
	}
}
208
209
// Shift the id of every loop-carried variable referenced in this expression by
// start_idx. Used when merging two blocks so the right block's carried ids do
// not collide with the left block's.
static void _ccv_nnc_expression_rename_carrieds(ccv_nnc_micro_loop_expression_t* const expression, const int start_idx)
{
	switch (expression->type)
	{
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_ID:
			// A bare id inside an expression can only be a loop-carried variable.
			assert(expression->id.type == CCV_NNC_MICRO_LOOP_CARRIED_ID);
			expression->id.id += start_idx;
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_TERNAY:
			_ccv_nnc_expression_rename_carrieds(expression->ternary.pivot, start_idx);
			_ccv_nnc_expression_rename_carrieds(expression->ternary.left, start_idx);
			_ccv_nnc_expression_rename_carrieds(expression->ternary.right, start_idx);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_BINARY:
			_ccv_nnc_expression_rename_carrieds(expression->binary.left, start_idx);
			_ccv_nnc_expression_rename_carrieds(expression->binary.right, start_idx);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_UNARY:
			_ccv_nnc_expression_rename_carrieds(expression->unary.x, start_idx);
			break;
		// We don't need to care about other expressions because loop-carried variable cannot participate these operations.
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR:
			break;
	}
}
234
235
// Shift every loop-carried variable id in the block by start_idx: the carried
// declarations on each loop, compound-assignment lvalues that name a carried
// variable, and all carried references inside rvalues.
//
// Fix over the previous revision: the ASSIGNMENT case reached the rvalue through
// the compound_assignment union member. Every other function in this file
// (_ccv_nnc_only_var_in_rvalue, _ccv_nnc_replacing_id_in_rvalue) accesses an
// ASSIGNMENT's rvalue via the assignment member; the two members only alias if
// their lvalue fields happen to have identical size, so use the correct member.
static void _ccv_nnc_loop_rename_carrieds(ccv_nnc_micro_loop_block_t* const block, const int start_idx)
{
	int i, j;
	const int loop_count = block->loop_count;
	ccv_nnc_micro_loop_t* const loops = block->loops;
	for (i = 0; i < loop_count; i++)
	{
		// Rename the carried declarations attached to this loop level.
		for (j = 0; j < loops[i].carried_count; j++)
			loops[i].carrieds[j].id.id += start_idx;
		for (j = 0; j < loops[i].statement_count; j++)
			switch (loops[i].statements[j].type)
			{
				case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_ASSIGNMENT:
					_ccv_nnc_expression_rename_carrieds(&loops[i].statements[j].assignment.rvalue, start_idx);
					break;
				case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_COMPOUND_ASSIGNMENT:
					// The lvalue of a compound assignment may itself be a carried variable.
					if (loops[i].statements[j].compound_assignment.lvalue.type == CCV_NNC_MICRO_LOOP_EXPR_TYPE_ID)
					{
						assert(loops[i].statements[j].compound_assignment.lvalue.id.type == CCV_NNC_MICRO_LOOP_CARRIED_ID);
						loops[i].statements[j].compound_assignment.lvalue.id.id += start_idx;
					}
					_ccv_nnc_expression_rename_carrieds(&loops[i].statements[j].compound_assignment.rvalue, start_idx);
					break;
			}
	}
}
261
262
// Check how tensor `id` is referenced inside `expression`:
//   0 - the tensor is not referenced at all;
//   1 - it is referenced, and every reference uses exactly the indexing of `var`
//       (same index count, same index terms under `groups`);
//   2 - it is referenced with a different indexing, which makes substitution unsafe.
// 2 is sticky: once any sub-expression reports a mismatch, the whole expression does.
static int _ccv_nnc_only_var_in_expression(const int id, const ccv_nnc_micro_loop_variable_t var, const ccv_nnc_micro_loop_expression_t* const expression, const int* const groups)
{
	switch (expression->type)
	{
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR:
			if (expression->variable.id.type == CCV_NNC_MICRO_TENSOR_ID && expression->variable.id.id == id)
			{
				if (var.index_count != expression->variable.index_count)
					return 2;
				int i;
				for (i = 0; i < var.index_count; i++)
					if (!_ccv_nnc_same_index_term(var.index[i], expression->variable.index[i], groups))
						return 2;
				return 1;
			} else
				return 0;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_TERNAY: {
			const int pivot = _ccv_nnc_only_var_in_expression(id, var, expression->ternary.pivot, groups);
			const int left = _ccv_nnc_only_var_in_expression(id, var, expression->ternary.left, groups);
			const int right = _ccv_nnc_only_var_in_expression(id, var, expression->ternary.right, groups);
			if (pivot == 2 || left == 2 || right == 2)
				return 2;
			return (pivot || left || right);
		}
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_BINARY: {
			const int left = _ccv_nnc_only_var_in_expression(id, var, expression->binary.left, groups);
			const int right = _ccv_nnc_only_var_in_expression(id, var, expression->binary.right, groups);
			if (left == 2 || right == 2)
				return 2;
			return (left || right);
		}
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_UNARY:
			return _ccv_nnc_only_var_in_expression(id, var, expression->unary.x, groups);
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_ID:
			// A bare id is a loop-carried variable, never a tensor reference.
			assert(expression->id.type == CCV_NNC_MICRO_LOOP_CARRIED_ID);
			return 0;
	}
	return 0;
}
301
302
static int _ccv_nnc_only_var_in_rvalue(const int id, const ccv_nnc_micro_loop_variable_t var, const ccv_nnc_micro_loop_statement_t statement, const int* const groups)
303
69
{
304
69
  switch (statement.type)
305
69
  {
306
52
    case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_ASSIGNMENT:
307
52
      return _ccv_nnc_only_var_in_expression(id, var, &statement.assignment.rvalue, groups);
308
17
    case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_COMPOUND_ASSIGNMENT:
309
17
      return _ccv_nnc_only_var_in_expression(id, var, &statement.compound_assignment.rvalue, groups);
310
0
  }
311
0
  return 0;
312
0
}
313
314
// Deep-copy an expression tree. Child expressions are cloned into freshly
// ccmalloc'd nodes; a VAR leaf deep-copies its index terms; an ID leaf has no
// owned storage and is returned by value. The caller owns the returned tree.
static ccv_nnc_micro_loop_expression_t _ccv_nnc_expression_deep_copy(const ccv_nnc_micro_loop_expression_t* const expression)
{
	switch (expression->type)
	{
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_TERNAY: {
			ccv_nnc_micro_loop_expression_t copy = *expression;
			copy.ternary.pivot = (ccv_nnc_micro_loop_expression_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_expression_t));
			*copy.ternary.pivot = _ccv_nnc_expression_deep_copy(expression->ternary.pivot);
			copy.ternary.left = (ccv_nnc_micro_loop_expression_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_expression_t));
			*copy.ternary.left = _ccv_nnc_expression_deep_copy(expression->ternary.left);
			copy.ternary.right = (ccv_nnc_micro_loop_expression_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_expression_t));
			*copy.ternary.right = _ccv_nnc_expression_deep_copy(expression->ternary.right);
			return copy;
		}
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_BINARY: {
			ccv_nnc_micro_loop_expression_t copy = *expression;
			copy.binary.left = (ccv_nnc_micro_loop_expression_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_expression_t));
			*copy.binary.left = _ccv_nnc_expression_deep_copy(expression->binary.left);
			copy.binary.right = (ccv_nnc_micro_loop_expression_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_expression_t));
			*copy.binary.right = _ccv_nnc_expression_deep_copy(expression->binary.right);
			return copy;
		}
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_UNARY: {
			ccv_nnc_micro_loop_expression_t copy = *expression;
			copy.unary.x = (ccv_nnc_micro_loop_expression_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_expression_t));
			*copy.unary.x = _ccv_nnc_expression_deep_copy(expression->unary.x);
			return copy;
		}
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR: {
			ccv_nnc_micro_loop_expression_t copy = *expression;
			int i;
			// Index terms may own heap nodes (binary terms), so clone them too.
			for (i = 0; i < copy.variable.index_count; i++)
				copy.variable.index[i] = ccv_nnc_micro_loop_index_deep_copy(&copy.variable.index[i]);
			return copy;
		}
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_ID:
			// Plain id: nothing heap-allocated to clone.
			return *expression;
	}
	return *expression;
}
354
355
// Substitute every reference to tensor `id` inside `expression` with `rvalue`.
// Ownership note: the first replacement moves `rvalue` in directly; every later
// replacement installs a deep copy, so the original `rvalue` is consumed exactly
// once. `count` accumulates the number of replacements across the whole statement.
static void _ccv_nnc_replacing_id_in_expression(ccv_nnc_micro_loop_expression_t* const expression, const int id, ccv_nnc_micro_loop_expression_t rvalue, int* const count)
{
	switch (expression->type)
	{
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR:
			if (expression->variable.id.type == CCV_NNC_MICRO_TENSOR_ID && expression->variable.id.id == id)
			{
				// Free the variable's owned index terms before overwriting the node.
				ccv_nnc_micro_loop_variable_free(&expression->variable);
				if (*count == 0) // First time, just assign to expression.
					*expression = rvalue;
				else // Otherwise, need to make deep copy of it.
					*expression = _ccv_nnc_expression_deep_copy(&rvalue);
				++(*count);
			}
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_TERNAY:
			_ccv_nnc_replacing_id_in_expression(expression->ternary.pivot, id, rvalue, count);
			_ccv_nnc_replacing_id_in_expression(expression->ternary.left, id, rvalue, count);
			_ccv_nnc_replacing_id_in_expression(expression->ternary.right, id, rvalue, count);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_BINARY:
			_ccv_nnc_replacing_id_in_expression(expression->binary.left, id, rvalue, count);
			_ccv_nnc_replacing_id_in_expression(expression->binary.right, id, rvalue, count);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_UNARY:
			_ccv_nnc_replacing_id_in_expression(expression->unary.x, id, rvalue, count);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_ID:
			// A bare id is a loop-carried variable, never a tensor reference.
			assert(expression->id.type == CCV_NNC_MICRO_LOOP_CARRIED_ID);
			break;
	}
}
387
388
// Substitute references to tensor `id` in a statement's right-hand side.
// Only rvalues can reference the tensor being substituted; a compound
// assignment's lvalue is a carried variable and never matches.
static void _ccv_nnc_replacing_id_in_rvalue(ccv_nnc_micro_loop_statement_t* const statement, const int id, ccv_nnc_micro_loop_expression_t rvalue, int* const count)
{
	if (statement->type == CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_ASSIGNMENT)
		_ccv_nnc_replacing_id_in_expression(&statement->assignment.rvalue, id, rvalue, count);
	else if (statement->type == CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_COMPOUND_ASSIGNMENT)
		_ccv_nnc_replacing_id_in_expression(&statement->compound_assignment.rvalue, id, rvalue, count);
}
401
402
typedef struct {
	int flag; // Non-zero while this block is still live; cleared blocks are skipped during merging.
	int merge_to; // Chain to the block this one was merged into; points to itself when unmerged (followed union-find style).
	ccv_array_t* writes; // Unique tensor ids this block writes (lazily allocated).
	ccv_array_t* reads; // Unique tensor ids this block reads (lazily allocated).
} ccv_nnc_micro_loop_block_dependency_t;
408
409
typedef struct {
	int flag; // NOTE(review): not referenced in this portion of the file — presumably a liveness marker; confirm against the rest of the file.
	ccv_array_t* writes; // Unique block indices that write this tensor (lazily allocated).
	ccv_array_t* reads; // Unique block indices that read this tensor (lazily allocated).
} ccv_nnc_micro_tensor_dependency_t;
414
415
// Walk an rvalue expression and record, for every tensor variable it references,
// a read edge both ways: tensor id into block i's read set, and block i into the
// tensor's reader set. Arrays are allocated lazily on first use.
static void _ccv_nnc_micro_block_dependencies_from_rvalue(const ccv_nnc_micro_loop_expression_t* const rvalue, const int i, ccv_nnc_micro_loop_block_dependency_t* const block_dependencies, ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies)
{
	switch (rvalue->type)
	{
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR:
			if (rvalue->variable.id.type == CCV_NNC_MICRO_TENSOR_ID)
			{
				if (!block_dependencies[i].reads)
					block_dependencies[i].reads = ccv_array_new(sizeof(int), 1, 0);
				const int id = rvalue->variable.id.id;
				ccv_array_add_unique_int(block_dependencies[i].reads, id);
				if (!tensor_dependencies[id].reads)
					tensor_dependencies[id].reads = ccv_array_new(sizeof(int), 1, 0);
				ccv_array_add_unique_int(tensor_dependencies[id].reads, i);
			}
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_TERNAY:
			_ccv_nnc_micro_block_dependencies_from_rvalue(rvalue->ternary.pivot, i, block_dependencies, tensor_dependencies);
			_ccv_nnc_micro_block_dependencies_from_rvalue(rvalue->ternary.left, i, block_dependencies, tensor_dependencies);
			_ccv_nnc_micro_block_dependencies_from_rvalue(rvalue->ternary.right, i, block_dependencies, tensor_dependencies);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_BINARY:
			_ccv_nnc_micro_block_dependencies_from_rvalue(rvalue->binary.left, i, block_dependencies, tensor_dependencies);
			_ccv_nnc_micro_block_dependencies_from_rvalue(rvalue->binary.right, i, block_dependencies, tensor_dependencies);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_UNARY:
			_ccv_nnc_micro_block_dependencies_from_rvalue(rvalue->unary.x, i, block_dependencies, tensor_dependencies);
			break;
		case CCV_NNC_MICRO_LOOP_EXPR_TYPE_ID:
			// Loop-carried variables are block-local, not cross-block dependencies.
			assert(rvalue->id.type == CCV_NNC_MICRO_LOOP_CARRIED_ID);
			break;
	}
}
448
449
// Record a write edge both ways: tensor id into block i's write set, block i
// into the tensor's writer set (arrays lazily allocated).
static void _ccv_nnc_micro_block_dependencies_add_write(ccv_nnc_micro_loop_block_dependency_t* const block_dependencies, ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies, const int i, const int id)
{
	if (!block_dependencies[i].writes)
		block_dependencies[i].writes = ccv_array_new(sizeof(int), 1, 0);
	ccv_array_add_unique_int(block_dependencies[i].writes, id);
	if (!tensor_dependencies[id].writes)
		tensor_dependencies[id].writes = ccv_array_new(sizeof(int), 1, 0);
	ccv_array_add_unique_int(tensor_dependencies[id].writes, i);
}

// Record a read edge both ways: tensor id into block i's read set, block i into
// the tensor's reader set (arrays lazily allocated).
static void _ccv_nnc_micro_block_dependencies_add_read(ccv_nnc_micro_loop_block_dependency_t* const block_dependencies, ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies, const int i, const int id)
{
	if (!block_dependencies[i].reads)
		block_dependencies[i].reads = ccv_array_new(sizeof(int), 1, 0);
	ccv_array_add_unique_int(block_dependencies[i].reads, id);
	if (!tensor_dependencies[id].reads)
		tensor_dependencies[id].reads = ccv_array_new(sizeof(int), 1, 0);
	ccv_array_add_unique_int(tensor_dependencies[id].reads, i);
}

// Build the block <-> tensor dependency tables for all blocks. For each statement:
// an assignment writes its lvalue tensor and reads its rvalue's tensors; a compound
// assignment additionally reads its lvalue tensor (when the lvalue is a tensor).
// The caller owns the two returned tables (free with _ccv_nnc_micro_dependencies_free).
static void _ccv_nnc_micro_block_dependencies(const ccv_nnc_micro_loop_block_t* const blocks, const int block_size, const int var_count, ccv_nnc_micro_loop_block_dependency_t** const block_dependencies_ref, ccv_nnc_micro_tensor_dependency_t** const tensor_dependencies_ref)
{
	ccv_nnc_micro_loop_block_dependency_t* const block_dependencies = (ccv_nnc_micro_loop_block_dependency_t*)cccalloc(block_size, sizeof(ccv_nnc_micro_loop_block_dependency_t));
	ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies = (ccv_nnc_micro_tensor_dependency_t*)cccalloc(var_count, sizeof(ccv_nnc_micro_tensor_dependency_t));
	int i, j, k;
	for (i = 0; i < block_size; i++)
	{
		block_dependencies[i].merge_to = i; // Each block starts merged to itself.
		const ccv_nnc_micro_loop_t* const loops = blocks[i].loops;
		const int loop_count = blocks[i].loop_count;
		for (j = 0; j < loop_count; j++)
		{
			const ccv_nnc_micro_loop_statement_t* const statements = loops[j].statements;
			const int statement_count = loops[j].statement_count;
			for (k = 0; k < statement_count; k++)
				switch (statements[k].type)
				{
					case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_ASSIGNMENT: {
						assert(statements[k].assignment.lvalue.id.type == CCV_NNC_MICRO_TENSOR_ID);
						_ccv_nnc_micro_block_dependencies_add_write(block_dependencies, tensor_dependencies, i, statements[k].assignment.lvalue.id.id);
						_ccv_nnc_micro_block_dependencies_from_rvalue(&statements[k].assignment.rvalue, i, block_dependencies, tensor_dependencies);
						break;
					}
					case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_COMPOUND_ASSIGNMENT: {
						if (statements[k].compound_assignment.lvalue.type == CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR)
						{
							assert(statements[k].compound_assignment.lvalue.id.type == CCV_NNC_MICRO_TENSOR_ID);
							const int id = statements[k].compound_assignment.lvalue.id.id;
							// A compound assignment both writes and reads its lvalue tensor.
							_ccv_nnc_micro_block_dependencies_add_write(block_dependencies, tensor_dependencies, i, id);
							_ccv_nnc_micro_block_dependencies_add_read(block_dependencies, tensor_dependencies, i, id);
						}
						_ccv_nnc_micro_block_dependencies_from_rvalue(&statements[k].compound_assignment.rvalue, i, block_dependencies, tensor_dependencies);
						break;
					}
				}
		}
	}
	*block_dependencies_ref = block_dependencies;
	*tensor_dependencies_ref = tensor_dependencies;
}
505
506
// Release both dependency tables built by _ccv_nnc_micro_block_dependencies,
// including their lazily-allocated read/write sets.
static void _ccv_nnc_micro_dependencies_free(ccv_nnc_micro_loop_block_dependency_t* const block_dependencies, const int block_size, ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies, const int var_count)
{
	// Per-block read/write sets first, then the table itself.
	for (int idx = 0; idx < block_size; idx++)
	{
		if (block_dependencies[idx].writes)
			ccv_array_free(block_dependencies[idx].writes);
		if (block_dependencies[idx].reads)
			ccv_array_free(block_dependencies[idx].reads);
	}
	ccfree(block_dependencies);
	// Same for the per-tensor sets.
	for (int idx = 0; idx < var_count; idx++)
	{
		if (tensor_dependencies[idx].writes)
			ccv_array_free(tensor_dependencies[idx].writes);
		if (tensor_dependencies[idx].reads)
			ccv_array_free(tensor_dependencies[idx].reads);
	}
	ccfree(tensor_dependencies);
}
526
527
// Check whether block y reads any tensor that an active block strictly between
// x and y writes. Such an intervening writer forbids merging y into x (the write
// must stay ordered between them). Writer indices are resolved through merge_to
// chains so merged blocks are attributed to their surviving block.
// Returns the offending block index, or 0 when there is none.
static int _ccv_nnc_tensor_reads_in_y_from_writes_after_x(const ccv_nnc_micro_loop_block_dependency_t* const block_dependencies, const ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies, const int x, const int y)
{
	int i, j;
	int flag = 0;
	for (i = 0; !flag && i < block_dependencies[y].reads->rnum; i++)
	{
		const int read_idx = *(int*)ccv_array_get(block_dependencies[y].reads, i);
		if (tensor_dependencies[read_idx].writes)
			for (j = 0; !flag && j < tensor_dependencies[read_idx].writes->rnum; j++)
			{
				int block_idx = *(int*)ccv_array_get(tensor_dependencies[read_idx].writes, j);
				// Resolve to the surviving block this writer was merged into.
				while (block_idx != block_dependencies[block_idx].merge_to)
					block_idx = block_dependencies[block_idx].merge_to;
				if (!block_dependencies[block_idx].flag) // Not in use, continue.
					continue;
				assert(block_idx <= y);
				// If the block_idx is strictly between x and y, we cannot merge.
				if (block_idx > x && block_idx != y)
					flag = block_idx;
			}
	}
	return flag;
}
550
551
// Mirror of _ccv_nnc_tensor_reads_in_y_from_writes_after_x: check whether block x
// writes any tensor that an active block strictly between x and y reads. Such an
// intervening reader forbids merging x down into y (the read must see x's write
// before y runs). Reader indices are resolved through merge_to chains.
// Returns the offending block index, or 0 when there is none.
static int _ccv_nnc_tensor_writes_in_x_reads_before_y(const ccv_nnc_micro_loop_block_dependency_t* const block_dependencies, const ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies, const int x, const int y)
{
	int i, j;
	int flag = 0;
	for (i = 0; !flag && i < block_dependencies[x].writes->rnum; i++)
	{
		const int write_idx = *(int*)ccv_array_get(block_dependencies[x].writes, i);
		if (tensor_dependencies[write_idx].reads)
			for (j = 0; !flag && j < tensor_dependencies[write_idx].reads->rnum; j++)
			{
				int block_idx = *(int*)ccv_array_get(tensor_dependencies[write_idx].reads, j);
				// Resolve to the surviving block this reader was merged into.
				while (block_idx != block_dependencies[block_idx].merge_to)
					block_idx = block_dependencies[block_idx].merge_to;
				if (!block_dependencies[block_idx].flag) // Not in use, continue.
					continue;
				assert(block_idx >= x);
				// If the block_idx is strictly between x and y, we cannot merge.
				if (block_idx < y && block_idx != x)
					flag = block_idx;
			}
	}
	return flag;
}
574
575
// Remove all assignments into tensor_idx (a tensor known to be dead) from every
// block that writes it, compacting each loop's statement array in place. When a
// block ends up with no statements at all, free and zero the whole block.
//
// Fix over the previous revision: after compacting the surviving l statements
// (and reallocating the array to l entries), the code set statement_count to 0
// instead of l. That contradicted the realloc just above and made the following
// "statement_count > 0" liveness check always fail, dropping surviving
// statements (and leaking them). Set it to l.
static void _ccv_nnc_tensor_remove_dead_store(const ccv_nnc_micro_tensor_dependency_t* const tensor_dependency, const int tensor_idx, ccv_array_t* const blocks)
{
	int i, j, k, l;
	if (tensor_dependency->writes)
		for (i = 0; i < tensor_dependency->writes->rnum; i++)
		{
			const int write_idx = *(int*)ccv_array_get(tensor_dependency->writes, i);
			ccv_nnc_micro_loop_block_t* const block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, write_idx);
			int flag = 0; // Set when any loop in this block still has statements.
			for (j = 0; j < block->loop_count; j++)
			{
				ccv_nnc_micro_loop_statement_t* const statements = block->loops[j].statements;
				// Compact in place: k scans, l tracks the survivors.
				for (k = 0, l = 0; k < block->loops[j].statement_count; k++)
				{
					// It cannot be compound assignment, in this case, this tensor will be in read, and
					// it will be in active use (anything "read" in an active block will be marked as in use).
					assert(!(statements[k].type == CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_COMPOUND_ASSIGNMENT &&
						statements[k].compound_assignment.lvalue.id.type == CCV_NNC_MICRO_TENSOR_ID &&
						statements[k].compound_assignment.lvalue.id.id == tensor_idx));
					if (statements[k].type == CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_ASSIGNMENT &&
						statements[k].assignment.lvalue.id.type == CCV_NNC_MICRO_TENSOR_ID &&
						statements[k].assignment.lvalue.id.id == tensor_idx)
					{
						// This is a dead store, prepare to remove.
						ccv_nnc_micro_loop_statement_free(&statements[k]);
					} else {
						statements[l] = statements[k];
						++l;
					}
				}
				if (l < block->loops[j].statement_count)
				{
					if (l > 0)
						block->loops[j].statements = (ccv_nnc_micro_loop_statement_t*)ccrealloc(block->loops[j].statements, sizeof(ccv_nnc_micro_loop_statement_t) * l);
					else {
						ccfree(block->loops[j].statements);
						block->loops[j].statements = 0;
					}
					block->loops[j].statement_count = l; // Fixed: was 0, which dropped the survivors.
				}
				if (block->loops[j].statement_count > 0)
					flag = 1;
			}
			if (!flag) // No statement for this block, remove this whole block.
			{
				ccv_nnc_micro_loops_free(block->loops, block->loop_count);
				ccfree(block->loops);
				block->loops = 0;
				block->loop_count = 0;
			}
		}
}
627
628
// Fuse pairs of blocks whose loop nests match (up to loop reordering) into a single block,
// so their statements run inside one shared loop nest.
// Normally the right block (j) is folded into the left block (i). If a block between i and j
// reads something the left block writes, the merge direction is flipped (merge_to_right) when
// legal; otherwise the pair is skipped. block_dependencies / tensor_dependencies hold the
// read/write sets used to validate ordering and are updated in place as blocks merge.
// max_loop_count bounds loop-nest depth (sizes the scratch VLAs); groups is the union-find
// mapping used for index-term equality.
static void _ccv_nnc_loop_merging(ccv_nnc_micro_loop_block_dependency_t* const block_dependencies, const ccv_nnc_micro_tensor_dependency_t* const tensor_dependencies, ccv_array_t* const blocks, const int max_loop_count, const int* const groups)
{
	int i, j;
	// Scratch arrays reused by _ccv_nnc_same_loop / _ccv_nnc_loop_order_by for every pair.
	int left_loop_idx[max_loop_count];
	int right_loop_idx[max_loop_count];
	ccv_nnc_micro_loop_t loops[max_loop_count];
	// Merge loops from blocks.
	for (i = 0; i < blocks->rnum - 1; i++)
	{
		ccv_nnc_micro_loop_block_t* const left_block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, i);
		if (left_block->loop_count == 0)
			continue; // Already merged away (loop_count zeroed below), nothing to fuse.
		for (j = i + 1; j < blocks->rnum; j++)
		{
			// We always merge from right block to left block. Thus, the right block will always be
			// in the original form.
			ccv_nnc_micro_loop_block_t* const right_block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, j);
			if (right_block->loop_count == 0)
				continue;
			int merge_to_right = 0;
			// First check whether between left and right, there are any blocks that the right block
			// depends on. If that is the case, we cannot merge the right block into the left block.
			if (j > i + 1 && block_dependencies[j].reads)
			{
				const int block_idx = _ccv_nnc_tensor_reads_in_y_from_writes_after_x(block_dependencies, tensor_dependencies, i, j);
				// Cannot merge because we have dependencies in between. Merging will violate that
				// dependency relationship.
				if (block_idx)
				{
					// Now check to see if left can be merged into right? If so, we are lucky.
					if (_ccv_nnc_tensor_writes_in_x_reads_before_y(block_dependencies, tensor_dependencies, i, j))
						continue;
					merge_to_right = 1;
				}
			}
			// This method not only compares whether they have the same loop or not, but also gives indexes that
			// to match the loop start / end index, where they should move to. For example, if:
			// left_loop_idx[2] = 3
			// right_loop_idx[0] = 3
			// That means right now, loop at index 2 on the left is the same as loop at index 0 on the right.
			// And to match exactly, they both need to move to index 3.
			if (_ccv_nnc_same_loop(left_block, right_block, groups, left_loop_idx, right_loop_idx))
			{
				// Make sure if we have extra loop, they are on the left.
				if (right_block->loop_count > left_block->loop_count)
				{
					ccv_nnc_micro_loop_block_t t;
					CCV_SWAP(*left_block, *right_block, t);
				}
				assert(left_block->loop_count == right_block->loop_count || left_block->loop_count == right_block->loop_count + 1);
				// Reorder both nests into the common order computed by _ccv_nnc_same_loop.
				_ccv_nnc_loop_order_by(left_block, left_loop_idx, loops);
				_ccv_nnc_loop_order_by(right_block, right_loop_idx, loops);
				// If left has one extra (outer) loop, the right nest aligns against left's inner loops.
				const int left_start_idx = left_block->loop_count - right_block->loop_count;
				if (left_block->carried_count > 0)
					_ccv_nnc_loop_rename_carrieds(right_block, left_block->carried_count);
				left_block->carried_count += right_block->carried_count;
				int k;
				for (k = 0; k < right_block->loop_count; k++) // Merge loops.
				{
					const int left_idx = left_start_idx + k;
					// Append right's loop-carried variables to the matching left loop, moving
					// ownership of the arrays (right side is zeroed so it is not double-freed).
					if (right_block->loops[k].carried_count > 0)
					{
						if (left_block->loops[left_idx].carried_count > 0)
						{
							left_block->loops[left_idx].carrieds = (ccv_nnc_micro_loop_carried_t*)ccrealloc(left_block->loops[left_idx].carrieds, sizeof(ccv_nnc_micro_loop_carried_t) * (left_block->loops[left_idx].carried_count + right_block->loops[k].carried_count));
							memcpy(left_block->loops[left_idx].carrieds + left_block->loops[left_idx].carried_count, right_block->loops[k].carrieds, sizeof(ccv_nnc_micro_loop_carried_t) * right_block->loops[k].carried_count);
							ccfree(right_block->loops[k].carrieds);
						} else
							left_block->loops[left_idx].carrieds = right_block->loops[k].carrieds;
						left_block->loops[left_idx].carried_count += right_block->loops[k].carried_count;
						right_block->loops[k].carrieds = 0;
						right_block->loops[k].carried_count = 0;
					}
					// Same transfer for statements: right's statements run after left's in the fused loop.
					if (right_block->loops[k].statement_count > 0)
					{
						if (left_block->loops[left_idx].statement_count > 0)
						{
							left_block->loops[left_idx].statements = (ccv_nnc_micro_loop_statement_t*)ccrealloc(left_block->loops[left_idx].statements, sizeof(ccv_nnc_micro_loop_statement_t) * (left_block->loops[left_idx].statement_count + right_block->loops[k].statement_count));
							memcpy(left_block->loops[left_idx].statements + left_block->loops[left_idx].statement_count, right_block->loops[k].statements, sizeof(ccv_nnc_micro_loop_statement_t) * right_block->loops[k].statement_count);
							ccfree(right_block->loops[k].statements);
						} else
							left_block->loops[left_idx].statements = right_block->loops[k].statements;
						left_block->loops[left_idx].statement_count += right_block->loops[k].statement_count;
						right_block->loops[k].statements = 0;
						right_block->loops[k].statement_count = 0;
					}
				}
				// Once merged, free the loop.
				ccfree(right_block->loops);
				right_block->loops = 0;
				right_block->loop_count = 0;
				// x receives the dependency info of y below; default is merge into left.
				int x = i, y = j;
				if (merge_to_right) // If this is merge to right.
				{
					ccv_nnc_micro_loop_block_t t;
					CCV_SWAP(*left_block, *right_block, t);
					x = j, y = i;
				}
				// Merge all reads and writes tensors into block dependency.
				if (block_dependencies[y].writes && block_dependencies[y].writes->rnum)
				{
					if (!block_dependencies[x].writes)
						block_dependencies[x].writes = ccv_array_new(sizeof(int), 1, 0);
					for (k = 0; k < block_dependencies[y].writes->rnum; k++)
						ccv_array_push(block_dependencies[x].writes, ccv_array_get(block_dependencies[y].writes, k));
				}
				if (block_dependencies[y].reads && block_dependencies[y].reads->rnum)
				{
					if (!block_dependencies[x].reads)
						block_dependencies[x].reads = ccv_array_new(sizeof(int), 1, 0);
					for (k = 0; k < block_dependencies[y].reads->rnum; k++)
						ccv_array_push(block_dependencies[x].reads, ccv_array_get(block_dependencies[y].reads, k));
				}
				// Merged, mark the proper merging dependency.
				block_dependencies[y].merge_to = x;
				if (merge_to_right) // If this is merge to right, now left is empty, break.
					break;
			}
		}
	}
}
749
750
// Substitute intermediate tensor variables out of the program: if a non-input, non-output
// var is written exactly once (plain assignment) and only consumed with the same index in
// the same loop, the assignment is deleted and its rvalue is inlined into every consumer,
// and the var is marked no_alloc (its buffer is never materialized).
static void _ccv_nnc_var_subst(ccv_nnc_micro_tensor_t* const vars, const int var_count, const ccv_nnc_micro_io_t* const inputs, const int input_size, const ccv_nnc_micro_io_t* const outputs, const int output_size, ccv_array_t* const blocks, const int* const groups)
{
	int i, j;
	// These are simple programs, so we are going to loop over all blocks to see whether a non-output-input
	// var only write / read in one loop. If that is the case, we are going to remove that var.
	// We have to do this replacement from bottom to top though.
	for (i = 0; i < var_count; i++)
	{
		// Inputs / outputs are part of the external contract and can never be substituted away.
		int flag = 0;
		for (j = 0; !flag && j < input_size; j++)
			flag = (inputs[j]->id == i);
		for (j = 0; !flag && j < output_size; j++)
			flag = (outputs[j]->id == i);
		if (flag) // This is in outputs or inputs.
			continue;
		// count_var accumulates "usage weight" across all blocks; only exactly 1 qualifies.
		int count_var = 0;
		// NOTE(review): lvalue is only assigned when a matching plain assignment is found in a
		// loop, but the second scan below passes it to _ccv_nnc_only_var_in_rvalue for every
		// loop — presumably that helper ignores lvalue unless the var appears; confirm it does
		// not read an indeterminate lvalue on loops with no matching assignment.
		ccv_nnc_micro_loop_variable_t lvalue;
		ccv_nnc_micro_loop_expression_t rvalue;
		// Location of the single defining assignment, valid only when count_var == 1.
		int block_idx, loop_idx, statement_idx;
		for (j = 0; j < blocks->rnum; j++)
		{
			const ccv_nnc_micro_loop_block_t* const block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, j);
			int k, l;
			const int loop_count = block->loop_count;
			const ccv_nnc_micro_loop_t* const loops = block->loops;
			int var_per_block = 0;
			for (k = 0; k < loop_count; k++)
			{
				// flag per loop: 1 = one clean assignment found, >= 2 = disqualified
				// (var on its own rvalue, multiple writes, or compound assignment).
				int flag = 0;
				const int statement_count = loops[k].statement_count;
				ccv_nnc_micro_loop_statement_t* const statements = loops[k].statements;
				for (l = 0; l < statement_count; l++)
					if (statements[l].type == CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_ASSIGNMENT &&
						statements[l].assignment.lvalue.id.type == CCV_NNC_MICRO_TENSOR_ID &&
						statements[l].assignment.lvalue.id.id == i)
					{
						lvalue = statements[l].assignment.lvalue;
						if (_ccv_nnc_only_var_in_rvalue(i, lvalue, statements[l], groups))
							flag = 2;
						else {
							// If the variable not showing up on the right-side, we can continue.
							rvalue = statements[l].assignment.rvalue;
							block_idx = j;
							loop_idx = k;
							statement_idx = l;
							++flag;
						}
					} else if (statements[l].type == CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_COMPOUND_ASSIGNMENT &&
						statements[l].compound_assignment.lvalue.id.type == CCV_NNC_MICRO_TENSOR_ID &&
						statements[l].compound_assignment.lvalue.id.id == i) {
						// This is compound assignment, automatically increase by 2.
						flag += 2;
					}
				if (flag > 1) // We have more than 1 assignment for this id, it is not good. We cannot remove it.
				{
					var_per_block += flag;
					continue;
				}
				// Scan reads: result 2 means the var is read with a different index than lvalue.
				for (l = 0; l < statement_count; l++)
					flag = ccv_max(flag, _ccv_nnc_only_var_in_rvalue(i, lvalue, statements[l], groups));
				// If flag == 2, meaning it found a var with a different index. This is a bad news.
				var_per_block += flag;
			}
			count_var += var_per_block;
		}
		// If this is used more than one place (write multiple times, have different index, or used in different blocks),
		// I cannot get rid of it.
		if (count_var != 1)
			continue;
		// Otherwise, now loop again and prepare to get rid of it.
		ccv_nnc_micro_loop_block_t* const block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, block_idx);
		ccv_nnc_micro_loop_statement_t* statements = block->loops[loop_idx].statements;
		// Keep a copy of the defining statement; its rvalue is still referenced during inlining.
		ccv_nnc_micro_loop_statement_t statement = statements[statement_idx];
		// First, remove the assignment.
		if (statement_idx < block->loops[loop_idx].statement_count - 1)
			memmove(statements + statement_idx, statements + statement_idx + 1, sizeof(ccv_nnc_micro_loop_statement_t) * (block->loops[loop_idx].statement_count - statement_idx - 1));
		--block->loops[loop_idx].statement_count;
		const int statement_count = block->loops[loop_idx].statement_count;
		statements = block->loops[loop_idx].statements = (ccv_nnc_micro_loop_statement_t*)ccrealloc(statements, sizeof(ccv_nnc_micro_loop_statement_t) * statement_count);
		// k counts how many times the rvalue got inlined into the remaining statements.
		int k = 0;
		for (j = 0; j < statement_count; j++)
			_ccv_nnc_replacing_id_in_rvalue(&statements[j], i, rvalue, &k);
		if (k == 0) // If nothing to replace, free up everything.
			ccv_nnc_micro_loop_statement_free(&statement);
		else
			ccv_nnc_micro_loop_statement_lvalue_free(&statement);
		// No need to allocate for this var. It is not used, only useful for shape computation.
		vars[i].no_alloc = 1;
	}
}
840
841
static int _ccv_nnc_index_check_axis_size(const ccv_nnc_micro_loop_index_term_t index, int* const axis_size_ref, int* const d_ref, const int* const groups)
842
84
{
843
84
  switch (index.type)
844
84
  {
845
0
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_NONE:
846
0
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL:
847
0
      return 1;
848
0
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY:
849
0
      return _ccv_nnc_index_check_axis_size(index.binary->left, axis_size_ref, d_ref, groups) && _ccv_nnc_index_check_axis_size(index.binary->left, axis_size_ref, d_ref, groups);
850
84
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID: {
851
84
      if (index.id.type == CCV_NNC_MICRO_AXIS_SIZE_ID)
852
84
      {
853
84
        int root = groups[index.id.id];
854
84
        while (groups[root] != root)
855
0
          root = groups[root];
856
84
        if (*axis_size_ref == -1)
857
42
          *axis_size_ref = root;
858
84
        if (*d_ref == -1)
859
42
          *d_ref = index.id.d;
860
84
        return root == *axis_size_ref && 
index.id.d == *d_ref54
;
861
84
      }
862
0
      return 1;
863
0
    }
864
0
  }
865
0
  return 0;
866
0
}
867
868
static int _ccv_nnc_index_binary_size(const ccv_nnc_micro_loop_index_term_t index)
869
22
{
870
22
  switch (index.type)
871
22
  {
872
0
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_NONE:
873
0
      return 0;
874
22
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL:
875
22
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID:
876
22
      return 1;
877
22
    case CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY:
878
0
      if (index.binary->op == CCV_NNC_MICRO_BINARY_OP_PLUS || index.binary->op == CCV_NNC_MICRO_BINARY_OP_MINUS)
879
0
        return _ccv_nnc_index_binary_size(index.binary->left) + _ccv_nnc_index_binary_size(index.binary->right);
880
0
      else
881
0
        return 1;
882
0
  }
883
0
  return 0;
884
0
}
885
886
// One flattened term of an additive index expression (see _ccv_nnc_index_term_flatten).
typedef struct {
	int sign:7; // CCV_NNC_MICRO_BINARY_OP_PLUS or CCV_NNC_MICRO_BINARY_OP_MINUS, the sign carried down from the +/- chain.
	int ignore:1; // Set once this term cancels against an equal term on the other side.
	ccv_nnc_micro_loop_index_term_t term; // The term itself (VAL, ID, or an opaque binary subtree).
} ccv_nnc_micro_loop_binary_term_t;
891
892
// Flatten an index expression's + / - chain into binary_terms, left to right, carrying
// the sign down the tree. *i is the running write cursor into binary_terms; the caller
// sizes the buffer with _ccv_nnc_index_binary_size.
static void _ccv_nnc_index_term_flatten(ccv_nnc_micro_loop_binary_term_t* const binary_terms, const ccv_nnc_micro_loop_index_term_t index, const int sign, int* const i)
{
	if (index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_NONE)
		return; // Empty index occupies no slot.
	const int additive = (index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY) &&
		(index.binary->op == CCV_NNC_MICRO_BINARY_OP_PLUS || index.binary->op == CCV_NNC_MICRO_BINARY_OP_MINUS);
	if (additive)
	{
		_ccv_nnc_index_term_flatten(binary_terms, index.binary->left, sign, i);
		// A minus node flips the sign of everything under its right child.
		int right_sign = sign;
		if (index.binary->op == CCV_NNC_MICRO_BINARY_OP_MINUS)
			right_sign = (sign == CCV_NNC_MICRO_BINARY_OP_PLUS) ? CCV_NNC_MICRO_BINARY_OP_MINUS : CCV_NNC_MICRO_BINARY_OP_PLUS;
		_ccv_nnc_index_term_flatten(binary_terms, index.binary->right, right_sign, i);
		return;
	}
	if (index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL ||
		index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID ||
		index.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY)
	{
		// VAL, ID, or a non-additive subtree (e.g. a product): record as one signed term.
		binary_terms[*i].term = index;
		binary_terms[*i].sign = sign;
		binary_terms[*i].ignore = 0;
		++(*i);
	}
}
922
923
// 0 is we don't understand, -1 is false, 1 is true.
924
static int _ccv_nnc_index_less_than_or_equal_to(const ccv_nnc_micro_loop_index_term_t left, const ccv_nnc_micro_loop_index_term_t right, const ccv_nnc_micro_tensor_t* const vars, const int var_count, const int* const groups)
925
80
{
926
80
  // Special case 1.
927
80
  if (left.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL && 
right.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL38
)
928
38
    return left.immediate_value <= right.immediate_value ? 1 : 
-10
;
929
42
  // Special case 2.
930
42
  if (left.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL && 
left.immediate_value == 00
&&
right.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID0
&&
right.id.type == CCV_NNC_MICRO_AXIS_SIZE_ID0
)
931
0
    return 1;
932
42
  // Special case 3.
933
42
  if (left.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID && left.id.type == CCV_NNC_MICRO_AXIS_SIZE_ID && right.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL && 
right.immediate_value == 00
)
934
0
    return -1;
935
42
  // Special case 4. We cannot understand this.
936
42
  if (right.type == CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL)
937
0
    return 0;
938
42
  // We only have either left and right are ID or BINARY. However, we cannot make any progress if it is referenced to more than 1 axis_size & d.
939
42
  int axis_size = -1, d = -1;
940
42
  if (!(_ccv_nnc_index_check_axis_size(left, &axis_size, &d, groups) && _ccv_nnc_index_check_axis_size(right, &axis_size, &d, groups)))
941
31
    return 0;
942
11
  // Now, we only have one variable in both left and right, need to flat the binary tree (if possible) and reduce it to constant if possible.
943
11
  // We can only flatten if it is + / - at the moment.
944
11
  const int left_binary_size = _ccv_nnc_index_binary_size(left);
945
11
  assert(left_binary_size >= 1);
946
11
  const int right_binary_size = _ccv_nnc_index_binary_size(right);
947
11
  assert(right_binary_size >= 1);
948
11
  ccv_nnc_micro_loop_binary_term_t* const left_binary_terms = (ccv_nnc_micro_loop_binary_term_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_binary_term_t) * (left_binary_size + right_binary_size));
949
11
  ccv_nnc_micro_loop_binary_term_t* const right_binary_terms = left_binary_terms + left_binary_size;
950
11
  int i, j;
951
11
  i = 0;
952
11
  _ccv_nnc_index_term_flatten(left_binary_terms, left, CCV_NNC_MICRO_BINARY_OP_PLUS, &i);
953
11
  assert(i == left_binary_size);
954
11
  i = 0;
955
11
  _ccv_nnc_index_term_flatten(right_binary_terms, right, CCV_NNC_MICRO_BINARY_OP_PLUS, &i);
956
11
  assert(i == right_binary_size);
957
22
  
for (i = 0; 11
i < left_binary_size;
i++11
)
958
22
    
for (j = 0; 11
j < right_binary_size;
j++11
)
959
11
      // If they are the same, we can ignore now.
960
11
      if (!left_binary_terms[i].ignore && !right_binary_terms[j].ignore &&
961
11
        _ccv_nnc_same_index_term(left_binary_terms[i].term, right_binary_terms[j].term, groups) &&
962
11
        left_binary_terms[i].sign == right_binary_terms[j].sign)
963
11
      {
964
11
        left_binary_terms[i].ignore = 1;
965
11
        right_binary_terms[i].ignore = 1;
966
11
      }
967
11
  // After reduced, we should only have immediate values left, otherwise we cannot progress.
968
11
  int left_val = 0;
969
22
  for (i = 0; i < left_binary_size; 
i++11
)
970
11
    if (!left_binary_terms[i].ignore)
971
0
    {
972
0
      if (left_binary_terms[i].term.type != CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL)
973
0
      {
974
0
        ccfree(left_binary_terms);
975
0
        return 0;
976
0
      } else
977
0
        left_val += left_binary_terms[i].sign == CCV_NNC_MICRO_BINARY_OP_PLUS ? left_binary_terms[i].term.immediate_value : -left_binary_terms[i].term.immediate_value;
978
0
    }
979
11
  int right_val = 0;
980
22
  for (i = 0; i < right_binary_size; 
i++11
)
981
11
    if (!right_binary_terms[i].ignore)
982
0
    {
983
0
      if (right_binary_terms[i].term.type != CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL)
984
0
      {
985
0
        ccfree(left_binary_terms);
986
0
        return 0;
987
0
      } else
988
0
        right_val += right_binary_terms[i].sign == CCV_NNC_MICRO_BINARY_OP_PLUS ? right_binary_terms[i].term.immediate_value : -right_binary_terms[i].term.immediate_value;
989
0
    }
990
11
  ccfree(left_binary_terms);
991
11
  return left_val <= right_val ? 1 : 
-10
;
992
11
}
993
994
// Compute a [low, high) bound pair for an index expression, resolving loop ids to their
// start / end indexes. Returns 1 on success (caller owns and must free *low_ref / *high_ref
// when they are heap-built), 0 when no useful bound can be derived.
// Fix: the original tagged/tested index terms with CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAL — the
// expression-type enum — while every other index-term consumer (e.g.
// _ccv_nnc_index_less_than_or_equal_to) tests CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL. Unless the
// two enums happen to share values, constant bounds produced here were never recognized.
static int _ccv_nnc_micro_low_high_bound_from_index(const ccv_nnc_micro_loop_index_term_t index, ccv_nnc_micro_loop_index_term_t* const low_ref, ccv_nnc_micro_loop_index_term_t* const high_ref, const ccv_nnc_micro_loop_t* const loops, const int loop_count)
{
	switch (index.type)
	{
		case CCV_NNC_MICRO_LOOP_INDEX_TYPE_NONE:
			// Empty index: degenerate [0, 0) bound.
			*low_ref = (ccv_nnc_micro_loop_index_term_t){
				.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL,
				.immediate_value = 0
			};
			*high_ref = (ccv_nnc_micro_loop_index_term_t){
				.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL,
				.immediate_value = 0
			};
			return 1;
		case CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID:
			if (index.id.type == CCV_NNC_MICRO_LOOP_ID)
			{
				// A loop variable ranges over [start_index, end_index) of its loop.
				assert(index.id.id >= 0 && index.id.id < loop_count);
				*low_ref = ccv_nnc_micro_loop_index_deep_copy(&loops[index.id.id].start_index);
				*high_ref = ccv_nnc_micro_loop_index_deep_copy(&loops[index.id.id].end_index);
			} else {
				// Non-loop ids (axis sizes etc.) are treated as a point value.
				*low_ref = index;
				*high_ref = index;
			}
			return 1;
		case CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL:
			// Constant: a point value.
			*low_ref = index;
			*high_ref = index;
			return 1;
		case CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY: {
			// Get low, high from both left and right, and then construct new low / high.
			ccv_nnc_micro_loop_index_term_t left_low, left_high;
			if (!_ccv_nnc_micro_low_high_bound_from_index(index.binary->left, &left_low, &left_high, loops, loop_count))
				return 0;
			ccv_nnc_micro_loop_index_term_t right_low, right_high;
			if (!_ccv_nnc_micro_low_high_bound_from_index(index.binary->right, &right_low, &right_high, loops, loop_count))
			{
				ccv_nnc_micro_loop_index_free(&left_low);
				ccv_nnc_micro_loop_index_free(&left_high);
				return 0;
			}
			// If left is not a range, or right is not a range, it is simple, just copy over.
			if (_ccv_nnc_same_index_term(left_low, left_high, 0) || _ccv_nnc_same_index_term(right_low, right_high, 0))
			{
				*low_ref = (ccv_nnc_micro_loop_index_term_t){
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				low_ref->binary->op = index.binary->op;
				low_ref->binary->left = left_low;
				low_ref->binary->right = right_low;
				*high_ref = (ccv_nnc_micro_loop_index_term_t){
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				high_ref->binary->op = index.binary->op;
				high_ref->binary->left = left_high;
				high_ref->binary->right = right_high;
				return 1;
			}
			// Cannot handle -, because lower bound will go to negative, similar for /. Only can handle + and *.
			if (!((index.binary->op == CCV_NNC_MICRO_BINARY_OP_PLUS || index.binary->op == CCV_NNC_MICRO_BINARY_OP_MUL)) ||
				// If lower bound is not a non-negative integer, we cannot compute interesting low / high bound, abort.
				(left_low.type != CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL || left_low.immediate_value < 0) ||
				(right_low.type != CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL || right_low.immediate_value < 0))
			{
				ccv_nnc_micro_loop_index_free(&left_low);
				ccv_nnc_micro_loop_index_free(&left_high);
				ccv_nnc_micro_loop_index_free(&right_low);
				ccv_nnc_micro_loop_index_free(&right_high);
				return 0;
			}
			// Lows are both non-negative constants, so the combined low folds to a constant.
			*low_ref = (ccv_nnc_micro_loop_index_term_t){
				.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL,
				.immediate_value = index.binary->op == CCV_NNC_MICRO_BINARY_OP_PLUS ? left_low.immediate_value + right_low.immediate_value : left_low.immediate_value * right_low.immediate_value,
			};
			// higher bound is not inclusive, hence, we need to minus extra 1 for this.
			if (index.binary->op == CCV_NNC_MICRO_BINARY_OP_PLUS)
			{
				// (left - 1) + (right - 1) + 1  ==  left + right - 1
				ccv_nnc_micro_loop_index_term_t sum = {
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				sum.binary->op = CCV_NNC_MICRO_BINARY_OP_PLUS;
				sum.binary->left = left_high;
				sum.binary->right = right_high;
				*high_ref = (ccv_nnc_micro_loop_index_term_t){
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				high_ref->binary->op = CCV_NNC_MICRO_BINARY_OP_MINUS;
				high_ref->binary->left = sum;
				high_ref->binary->right = (ccv_nnc_micro_loop_index_term_t){
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL,
					.immediate_value = 1
				};
			} else {
				// (left - 1) * (right - 1) + 1  ==  left * right - left - right + 2
				ccv_nnc_micro_loop_index_term_t prod = {
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				prod.binary->op = CCV_NNC_MICRO_BINARY_OP_MUL;
				prod.binary->left = left_high;
				prod.binary->right = right_high;
				ccv_nnc_micro_loop_index_term_t minus_left = {
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				minus_left.binary->op = CCV_NNC_MICRO_BINARY_OP_MINUS;
				minus_left.binary->left = prod;
				minus_left.binary->right = left_high;
				ccv_nnc_micro_loop_index_term_t minus_right = {
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				minus_right.binary->op = CCV_NNC_MICRO_BINARY_OP_MINUS;
				minus_right.binary->left = minus_left;
				minus_right.binary->right = right_high;
				*high_ref = (ccv_nnc_micro_loop_index_term_t){
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY,
					.binary = (ccv_nnc_micro_loop_index_binary_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_index_binary_t))
				};
				high_ref->binary->op = CCV_NNC_MICRO_BINARY_OP_PLUS;
				high_ref->binary->left = minus_right;
				high_ref->binary->right = (ccv_nnc_micro_loop_index_term_t){
					.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL,
					.immediate_value = 2
				};
			}
			return 1;
		}
	}
	return 0;
}
1130
1131
// For each index of a tensor variable reference, decide whether the runtime bound check can
// be elided: no_check_bound[i] is set to 1 only when we can prove symbolically that the
// index stays within [0, axis size) (either the variable's own axis-size id or, when
// available, its recorded shape term).
static void _ccv_nnc_micro_check_bound_for_variable(ccv_nnc_micro_loop_variable_t* const variable, const ccv_nnc_micro_loop_t* const loops, const int loop_count, const ccv_nnc_micro_tensor_t* const vars, const int var_count, const int* const groups)
{
	if (variable->id.type != CCV_NNC_MICRO_TENSOR_ID)
		return; // Only tensor accesses carry bound checks.
	int i;
	assert(variable->id.id >= 0 && variable->id.id < var_count);
	ccv_nnc_micro_loop_index_term_t index_zero = {
		.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL,
		.immediate_value = 0
	};
	for (i = 0; i < variable->index_count; i++)
	{
		// Symbolic reference to this variable's size along dimension i.
		ccv_nnc_micro_loop_index_term_t shape = {
			.type = CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID,
			.id = {
				.type = CCV_NNC_MICRO_AXIS_SIZE_ID,
				.id = variable->id.id,
				.d = i
			}
		};
		switch (variable->index[i].type)
		{
			case CCV_NNC_MICRO_LOOP_INDEX_TYPE_ID:
				// For loop id, we can check the range to see if it is within the shape.
				if (variable->index[i].id.type == CCV_NNC_MICRO_LOOP_ID)
				{
					assert(variable->index[i].id.id >= 0 && variable->index[i].id.id < loop_count);
					// Safe iff 0 <= loop start AND loop end <= axis size (or <= the explicit
					// shape term for this dimension when one is recorded).
					if (_ccv_nnc_index_less_than_or_equal_to(index_zero, loops[variable->index[i].id.id].start_index, vars, var_count, groups) == 1 &&
						(_ccv_nnc_index_less_than_or_equal_to(loops[variable->index[i].id.id].end_index, shape, vars, var_count, groups) == 1 ||
						 (vars[variable->id.id].shape != 0 && _ccv_nnc_index_less_than_or_equal_to(loops[variable->index[i].id.id].end_index, vars[variable->id.id].shape[i], vars, var_count, groups) == 1)))
						variable->no_check_bound[i] = 1;
					else
						variable->no_check_bound[i] = 0;
				} else // If it is anything other than loop id, we have to check the bound.
					variable->no_check_bound[i] = 0;
				break;
			case CCV_NNC_MICRO_LOOP_INDEX_TYPE_BINARY: {
				// Compute higher / lower bounds along the expression.
				ccv_nnc_micro_loop_index_term_t low, high;
				// Cannot find high low, mark no_check_bound[i] = 0
				if (!_ccv_nnc_micro_low_high_bound_from_index(variable->index[i], &low, &high, loops, loop_count))
				{
					variable->no_check_bound[i] = 0;
					break;
				}
				// Same criterion as above, but against the derived [low, high) of the expression.
				if (_ccv_nnc_index_less_than_or_equal_to(index_zero, low, vars, var_count, groups) == 1 &&
					(_ccv_nnc_index_less_than_or_equal_to(high, shape, vars, var_count, groups) == 1 ||
					 (vars[variable->id.id].shape != 0 && _ccv_nnc_index_less_than_or_equal_to(high, vars[variable->id.id].shape[i], vars, var_count, groups) == 1)))
					variable->no_check_bound[i] = 1;
				else
					variable->no_check_bound[i] = 0;
				ccv_nnc_micro_loop_index_free(&low);
				ccv_nnc_micro_loop_index_free(&high);
				break;
			}
			case CCV_NNC_MICRO_LOOP_INDEX_TYPE_VAL:
				// If the index is an integer, and it is bigger than 0, we need to check bound (there is no assertion the end index is larger than anything other than 0).
				if (variable->index[i].immediate_value == 0)
					variable->no_check_bound[i] = 1;
				else
					variable->no_check_bound[i] = 0;
				break;
		}
	}
}
1196
1197
// Recursively walk an expression tree and mark, for every tensor variable
// reference found, which axes can skip runtime bound checks.
// The actual per-variable analysis is done by
// _ccv_nnc_micro_check_bound_for_variable; this function only dispatches on
// the expression node kind and recurses into sub-expressions.
static void _ccv_nnc_micro_check_bound_for_expression(ccv_nnc_micro_loop_expression_t* const expression, const ccv_nnc_micro_loop_t* const loops, const int loop_count, const ccv_nnc_micro_tensor_t* const vars, const int var_count, const int* const groups)
{
  const int expr_type = expression->type;
  if (expr_type == CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR) {
    // Leaf node: a direct tensor-variable access, analyze its index bounds.
    _ccv_nnc_micro_check_bound_for_variable(&expression->variable, loops, loop_count, vars, var_count, groups);
  } else if (expr_type == CCV_NNC_MICRO_LOOP_EXPR_TYPE_TERNAY) {
    // Ternary (select): all three operands may reference variables.
    _ccv_nnc_micro_check_bound_for_expression(expression->ternary.pivot, loops, loop_count, vars, var_count, groups);
    _ccv_nnc_micro_check_bound_for_expression(expression->ternary.left, loops, loop_count, vars, var_count, groups);
    _ccv_nnc_micro_check_bound_for_expression(expression->ternary.right, loops, loop_count, vars, var_count, groups);
  } else if (expr_type == CCV_NNC_MICRO_LOOP_EXPR_TYPE_BINARY) {
    // Binary operator: recurse into both operands.
    _ccv_nnc_micro_check_bound_for_expression(expression->binary.left, loops, loop_count, vars, var_count, groups);
    _ccv_nnc_micro_check_bound_for_expression(expression->binary.right, loops, loop_count, vars, var_count, groups);
  } else if (expr_type == CCV_NNC_MICRO_LOOP_EXPR_TYPE_UNARY) {
    // Unary operator: single operand.
    _ccv_nnc_micro_check_bound_for_expression(expression->unary.x, loops, loop_count, vars, var_count, groups);
  }
  // Other node kinds (e.g. plain scalar values) reference no tensor variable,
  // so there is nothing to mark.
}
1218
1219
// For every statement in every loop of a block, mark which tensor accesses
// need runtime bound checks. Both the left-hand side (written variable) and
// the right-hand side (read expression) of each statement are analyzed.
static void _ccv_nnc_micro_check_bound_for_block(ccv_nnc_micro_loop_block_t* const block, const ccv_nnc_micro_tensor_t* const vars, const int var_count, const int* const groups)
{
  int loop_idx, stmt_idx;
  for (loop_idx = 0; loop_idx < block->loop_count; loop_idx++)
  {
    const int statement_count = block->loops[loop_idx].statement_count;
    ccv_nnc_micro_loop_statement_t* const statements = block->loops[loop_idx].statements;
    for (stmt_idx = 0; stmt_idx < statement_count; stmt_idx++)
    {
      ccv_nnc_micro_loop_statement_t* const statement = statements + stmt_idx;
      switch (statement->type)
      {
        case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_ASSIGNMENT:
          // Plain assignment: lvalue is always a variable; rvalue is an
          // arbitrary expression tree.
          _ccv_nnc_micro_check_bound_for_variable(&statement->assignment.lvalue, block->loops, block->loop_count, vars, var_count, groups);
          _ccv_nnc_micro_check_bound_for_expression(&statement->assignment.rvalue, block->loops, block->loop_count, vars, var_count, groups);
          break;
        case CCV_NNC_MICRO_LOOP_STATEMENT_TYPE_COMPOUND_ASSIGNMENT:
          // Compound assignment (e.g. accumulation): the lvalue may be a
          // non-variable expression (e.g. a reduction carrier), in which
          // case only the rvalue needs analysis.
          if (statement->compound_assignment.lvalue.type == CCV_NNC_MICRO_LOOP_EXPR_TYPE_VAR)
            _ccv_nnc_micro_check_bound_for_variable(&statement->compound_assignment.lvalue.variable, block->loops, block->loop_count, vars, var_count, groups);
          _ccv_nnc_micro_check_bound_for_expression(&statement->compound_assignment.rvalue, block->loops, block->loop_count, vars, var_count, groups);
          break;
      }
    }
  }
}
1243
1244
// Simplify a micro program in place: group shape-equivalent variables,
// eliminate dead blocks and dead stores, merge compatible loops, substitute
// variables, mark per-axis bound-check requirements, and finally collapse all
// surviving blocks into a single function.
//
// Parameters:
//   program     - the program to simplify (mutated in place).
//   inputs      - the program's external input tensors (size input_size).
//   outputs     - the program's external output tensors (size output_size);
//                 only blocks that (transitively) contribute to these are kept.
//
// After this call program->function_count is 1 and program->functions[0]
// holds every surviving loop block.
void ccv_nnc_micro_program_simplify(ccv_nnc_micro_program_t* const program, const ccv_nnc_micro_io_t* const inputs, const int input_size, const ccv_nnc_micro_io_t* const outputs, const int output_size)
{
  // Nothing to simplify for.
  if (program->function_count < 1)
    return;
  // Only one block, nothing to simplify for.
  if (program->function_count == 1 && program->functions[0].block_count == 1)
    return;
  if (input_size == 0 || output_size == 0)
    return;
  // Phase 1: Union-find to group all variables with the same shape.
  // groups[i] is the parent pointer; a root satisfies groups[i] == i.
  ccv_nnc_micro_tensor_t* const vars = program->vars;
  const int var_count = program->var_count;
  int* const groups = (int*)ccmalloc(sizeof(int) * var_count);
  int i, j;
  for (i = 0; i < var_count; i++)
    groups[i] = i;
  // If no shape, they should match these input.
  for (i = 0; i < var_count; i++)
    if (vars[i].input >= 0 && !vars[i].shape)
    {
      // Walk to the root of the input's group (no path compression).
      int root = vars[i].input;
      while (groups[root] != root)
        root = groups[root];
      groups[i] = root;
    }
  for (i = 0; i < var_count; i++)
  {
    // If this is input (no other tensor as the input), we skip.
    if (vars[i].input < 0)
      continue;
    int root = i;
    while (groups[root] != root)
      root = groups[root];
    // If the sibling exists and we haven't visited yet, mark them has the same group as us.
    // NOTE(review): groups[] is initialized to the identity (all entries >= 0),
    // so `groups[vars[i].sibling] < 0` can never be true and this union never
    // fires (coverage shows 0 hits). Possibly the intent was a different
    // "unvisited" sentinel — TODO confirm against upstream.
    if (vars[i].sibling >= 0 && vars[i].sibling < i && groups[vars[i].sibling] < 0)
      groups[vars[i].sibling] = root;
  }
  for (i = var_count - 1; i > 0; i--)
  {
    // Now matching the shape.
    if (vars[i].input < 0 || !vars[i].shape)
      continue;
    int root = i;
    while (groups[root] != root)
      root = groups[root];
    // Union every earlier variable with an identical explicit shape into
    // this variable's group.
    for (j = i - 1; j >= 0; j--)
      if (vars[j].shape && vars[j].dimensions == vars[i].dimensions &&
        _ccv_nnc_same_shape(vars[j].shape, vars[i].shape, vars[i].dimensions))
        groups[j] = root;
  }
  // Phase 2: First, flat out all functions into blocks.
  ccv_array_t* const blocks = ccv_array_new(sizeof(ccv_nnc_micro_loop_block_t), 0, 0);
  ccv_nnc_micro_function_t* const functions = program->functions;
  const int function_count = program->function_count;
  int max_loop_count = 0;
  for (i = 0; i < function_count; i++)
  {
    const int block_count = functions[i].block_count;
    // Functions with a single block store it inline (one_block); otherwise
    // they hold a heap array (blocks).
    ccv_nnc_micro_loop_block_t* const function_blocks = block_count == 1 ? &functions[i].one_block : functions[i].blocks;
    for (j = 0; j < block_count; j++)
    {
      max_loop_count = ccv_max(function_blocks[j].loop_count, max_loop_count);
      ccv_array_push(blocks, &function_blocks[j]);
    }
  }
  // Phase 3: Next, find dependencies between these function blocks and marking these that are dependencies for the final outputs.
  // We need to build our connections between blocks <-> r/w vars.
  ccv_nnc_micro_loop_block_dependency_t* block_dependencies;
  ccv_nnc_micro_tensor_dependency_t* tensor_dependencies;
  const int block_size = blocks->rnum;
  _ccv_nnc_micro_block_dependencies((ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, 0), block_size, var_count, &block_dependencies, &tensor_dependencies);
  ccv_array_t* const in_use = ccv_array_new(sizeof(int), output_size, 0);
  // Use the dependencies to mark blocks / vars that are in use.
  for (i = 0; i < output_size; i++)
  {
    tensor_dependencies[outputs[i]->id].flag = 1; // Mark them as in use.
    ccv_array_push(in_use, &outputs[i]->id);
  }
  for (i = 0; i < input_size; i++)
    tensor_dependencies[inputs[i]->id].flag = 1; // Mark inputs as in use so we don't go pass them.
  // Worklist traversal: in_use grows as we discover tensors that feed the
  // outputs; for each, mark the blocks that write it and the tensors those
  // blocks read.
  for (i = 0; i < in_use->rnum; i++)
  {
    const int tensor_idx = *(int*)ccv_array_get(in_use, i);
    if (tensor_dependencies[tensor_idx].writes)
      for (j = 0; j < tensor_dependencies[tensor_idx].writes->rnum; j++)
      {
        const int block_idx = *(int*)ccv_array_get(tensor_dependencies[tensor_idx].writes, j);
        block_dependencies[block_idx].flag = 1;
        int k;
        if (block_dependencies[block_idx].reads)
          for (k = 0; k < block_dependencies[block_idx].reads->rnum; k++)
          {
            const int read_idx = *(int*)ccv_array_get(block_dependencies[block_idx].reads, k);
            if (!tensor_dependencies[read_idx].flag)
            {
              tensor_dependencies[read_idx].flag = 1;
              ccv_array_push(in_use, &read_idx);
            }
          }
      }
  }
  ccv_array_free(in_use);
  // Phase 4: free loops of blocks that do not contribute to any output
  // (dead blocks), leaving them empty so later culling removes them.
  for (i = 0; i < block_size; i++)
    if (!block_dependencies[i].flag)
    {
      ccv_nnc_micro_loop_block_t* const block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, i);
      ccv_nnc_micro_loops_free(block->loops, block->loop_count);
      ccfree(block->loops);
      block->loops = 0;
      block->loop_count = 0;
    }
  for (i = 0; i < var_count; i++)
    if (!tensor_dependencies[i].flag) // If this tensor is not visited, there is no need to alloc.
    {
      _ccv_nnc_tensor_remove_dead_store(&tensor_dependencies[i], i, blocks);
      vars[i].no_alloc = 1;
    }
  // Phase 5: merge compatible loops across blocks, then release the
  // dependency bookkeeping.
  _ccv_nnc_loop_merging(block_dependencies, tensor_dependencies, blocks, max_loop_count, groups);
  _ccv_nnc_micro_dependencies_free(block_dependencies, block_size, tensor_dependencies, var_count);
  // Culling out empty blocks.
  for (i = 0, j = 0; i < blocks->rnum; i++)
  {
    const ccv_nnc_micro_loop_block_t* const block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, i);
    if (block->loop_count > 0)
    {
      // Compact in place: j trails i, keeping only non-empty blocks.
      *(ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, j) = *block;
      ++j;
    }
  }
  // Now we moved everything, set the proper block size.
  ccv_array_resize(blocks, j);
  // Substitute variables.
  _ccv_nnc_var_subst(vars, var_count, inputs, input_size, outputs, output_size, blocks, groups);
  // Mark whether we need to check bound for a particular variable or not.
  for (i = 0; i < blocks->rnum; i++)
  {
    ccv_nnc_micro_loop_block_t* const block = (ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, i);
    _ccv_nnc_micro_check_bound_for_block(block, vars, var_count, groups);
  }
  // NOTE(review): groups was allocated with ccmalloc but is released with
  // plain free() here, while the rest of the file pairs ccmalloc/ccfree —
  // harmless if ccmalloc wraps malloc, but inconsistent; verify.
  free(groups);
  // Phase 6: Reallocate function to be 1.
  // Free the per-function block arrays (the loop blocks themselves were
  // moved into `blocks` by value and survive).
  for (i = 0; i < function_count; i++)
    if (functions[i].block_count > 1)
      ccfree(functions[i].blocks);
  program->functions = (ccv_nnc_micro_function_t*)ccrealloc(program->functions, sizeof(ccv_nnc_micro_function_t));
  program->functions[0].block_count = blocks->rnum;
  if (blocks->rnum > 1)
  {
    program->functions[0].blocks = (ccv_nnc_micro_loop_block_t*)ccmalloc(sizeof(ccv_nnc_micro_loop_block_t) * blocks->rnum);
    memcpy(program->functions[0].blocks, ccv_array_get(blocks, 0), sizeof(ccv_nnc_micro_loop_block_t) * blocks->rnum);
  } else
    program->functions[0].one_block = *(ccv_nnc_micro_loop_block_t*)ccv_array_get(blocks, 0);
  program->function_count = 1;
  ccv_array_free(blocks);
}