Coverage Report

Created: 2017-11-12 13:27

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_symbolic_graph_compile.c
Line | Count | Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#ifdef HAVE_CUDA
6
#include "gpu/ccv_nnc_compat.h"
7
#endif
8
#include "_ccv_nnc_graph.h"
9
#include "_ccv_nnc_symbolic_graph.h"
10
11
typedef struct {
12
  int flags;
13
  int type;
14
  int ref; // Reference to another tensor block. Start with 1.
15
  int graph_ref; // Reference to a particular graph. Start with 1.
16
  int companion_ref; // Reference to another block with which this one shares the same memory region. Start with 1. The current crude implementation requires the two blocks to be companions of each other. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. The allocation algorithm uses the primary throughout.
17
  ccv_array_t* r_refs; // If this is referenced by other blocks, the array points back to those blocks. Start with 1.
18
  uint64_t size; // The size of the tensor expected.
19
  int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
20
  int dup_p_ref; // Reference to the parent tensor block from the duplicated tensor blocks. It will only be one, for the output. Start with 1.
21
  ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
22
  ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
23
} ccv_nnc_tensor_block_t; // Tensor Arena Block
24
25
1.04k
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
26
27
enum {
28
  UNASSIGNED = 0x1,
29
  ALIAS = 0x2,
30
  CONST_TENSOR = 0x3,
31
  READ_ONLY = 0x4,
32
  WRITE_ONLY = 0x8,
33
  READ_WRITE = 0xc,
34
  ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
35
};
36
37
#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
38
#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
39
37.4k
#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
40
48
#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
41
41.6k
#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
42
4.25k
#define TENSOR_EXPECT_CONST(t) ((t.flags & 0x3) == CONST_TENSOR)
43
40.1k
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
44
281
#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
45
72
#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
46
24
#define TENSOR_SET_ANONYMOUS(t) (t.flags = (t.flags & ~0x10 | ANONYMOUS))
47
#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
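A note on the flag layout above: the low two bits of flags carry the assignment state (ordinary, UNASSIGNED, ALIAS, or CONST_TENSOR), bits 2-3 carry the read/write mode, and bit 4 marks anonymous blocks, which is why every macro masks with 0x3, 0xc, or 0x10 before comparing or setting. A minimal standalone sketch of the same bit packing (a local block_t stands in for ccv_nnc_tensor_block_t; this is illustrative, not the library's code):

#include <assert.h>

typedef struct { int flags; } block_t; // stand-in for ccv_nnc_tensor_block_t

enum { BLK_UNASSIGNED = 0x1, BLK_READ_ONLY = 0x4, BLK_WRITE_ONLY = 0x8, BLK_ANONYMOUS = 0x10 };

int main(void)
{
  block_t b = { 0 };
  b.flags = (b.flags & ~0x3) | BLK_UNASSIGNED;                  // like TENSOR_EXPECT_SET_UNASSIGNED
  assert((b.flags & 0x3) == BLK_UNASSIGNED);                    // like TENSOR_EXPECT_UNASSIGNED
  b.flags = (b.flags & ~0xc) | BLK_READ_ONLY | BLK_WRITE_ONLY;  // like TENSOR_SET_READ_WRITE(t, READ_WRITE)
  assert((b.flags & 0xc) == (BLK_READ_ONLY | BLK_WRITE_ONLY));  // like TENSOR_READ_WRITE(t) == READ_WRITE
  b.flags = (b.flags & ~0x10) | BLK_ANONYMOUS;                  // like TENSOR_SET_ANONYMOUS
  assert(b.flags & BLK_ANONYMOUS);                              // like TENSOR_IS_ANONYMOUS
  return 0;
}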
48
49
typedef struct {
50
  int index;
51
  int companion; // The companion node index (the node that doesn't interfere with current one).
52
  int oc;
53
  uint64_t size;
54
} ccv_nnc_tensor_opt_t;
55
56
2.78k
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
57
2.78k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
58
#undef more_than
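The more_than predicate makes the generated qsort order candidates by size first and, on ties, by overlap count, so the assignment loop below always tries the largest, most-contended block first. A hedged sketch of the same ordering using the standard qsort comparator convention (opt_t is a local stand-in for ccv_nnc_tensor_opt_t):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct { int index, oc; uint64_t size; } opt_t; // stand-in for ccv_nnc_tensor_opt_t

// Descending by size, then descending by overlap count, mirroring more_than.
static int by_size_and_oc(const void* a, const void* b)
{
  const opt_t* x = a;
  const opt_t* y = b;
  if (x->size != y->size)
    return (x->size > y->size) ? -1 : 1;
  return (x->oc > y->oc) ? -1 : (x->oc < y->oc);
}

int main(void)
{
  opt_t opts[] = { { 0, 2, 64 }, { 1, 5, 128 }, { 2, 9, 64 } };
  qsort(opts, 3, sizeof(opt_t), by_size_and_oc);
  for (int i = 0; i < 3; i++)
    printf("%d: size %llu oc %d\n", opts[i].index, (unsigned long long)opts[i].size, opts[i].oc);
  return 0; // prints indices in order 1, 2, 0
}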
59
60
// Whether every one of a's head nodes is deterministically after every one of b's tail nodes.
61
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
62
12.7k
{
63
12.7k
  assert(a.head);
64
12.7k
  assert(b.tail);
65
12.7k
  int x, y;
66
13.9k
  for (x = 0; x < a.head->rnum; x++)
67
14.0k
    
    for (y = 0; y < b.tail->rnum; y++)
68
12.8k
    {
69
12.8k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a.head, x), *(int*)ccv_array_get(b.tail, y));
70
12.8k
      if (!cell.i32 || cell.i32[0] == 0)
71
11.6k
        return 0;
72
12.8k
    }
73
12.7k
  // If we made it through the nested loops, every head of a is verifiably, deterministically after every tail of b.
74
1.12k
  
  return (a.head->rnum > 0 && b.tail->rnum > 0);
75
12.7k
}
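The function above declares that block a can safely follow block b only when every node in a.head is deterministically ordered after every node in b.tail; a single missing dependency cell means the lifetimes may overlap. A simplified sketch of the same all-pairs check, assuming a plain boolean matrix dep[i][j] ("node i runs after node j") in place of the exec_dep sparse matrix used here:

#include <stdio.h>

#define N 4

// dep[i][j] != 0 means execution node i is deterministically after node j.
static int head_after_tail(int dep[N][N], const int* heads, int head_count, const int* tails, int tail_count)
{
  for (int x = 0; x < head_count; x++)
    for (int y = 0; y < tail_count; y++)
      if (!dep[heads[x]][tails[y]])
        return 0; // one unreachable pair is enough to declare interference
  return head_count > 0 && tail_count > 0;
}

int main(void)
{
  // Linear chain 0 -> 1 -> 2 -> 3.
  int dep[N][N] = { { 0 } };
  for (int i = 0; i < N; i++)
    for (int j = 0; j < i; j++)
      dep[i][j] = 1;
  int a_heads[] = { 2 }, b_tails[] = { 1 };
  printf("%d\n", head_after_tail(dep, a_heads, 1, b_tails, 1)); // 1: a starts after b ends
  printf("%d\n", head_after_tail(dep, b_tails, 1, a_heads, 1)); // 0: b does not start after a ends
  return 0;
}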
76
77
typedef struct {
78
  ccv_array_t** alloc_dep;
79
  int vt_block_size;
80
  int buffer_size;
81
  int block_size;
82
  int* vt_blocks; // A reference into blocks; blocks only contains the available blocks (thus, it doesn't include aliases etc.). -1 means no block is pointed to. Starts at 0.
83
  struct {
84
    uint64_t size; // The size of the buffer allocated.
85
    int p_refs[2]; // Reference to the upper-level block. Starts at 1. Only index 0 is valid throughout; the second one is used in the code as a temporary placeholder.
86
    int dup_p_ref; // Reference to the parent tensor block from the duplicated tensor blocks. It will only be one, for the output. Start with 1.
87
    int type; // The type from tensor blocks.
88
    int flags; // The flags (currently for READ_ONLY or not).
89
  }* buffers;
90
  struct {
91
    int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
92
    int block_ref; // A reference to which block in the given tensor_block to use.
93
    uint64_t offset; // The offset of this block.
94
  }* blocks;
95
} ccv_nnc_tensor_alloc_prep_t;
96
97
typedef struct ccv_nnc_symbolic_graph_prep_s {
98
  intptr_t graph_ref;
99
  int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
100
  int exec_idx;
101
  int nth_unroll; // How many times this graph is unrolled before we can have proper assignment.
102
  int tensor_symbol_info_size;
103
  int exec_symbol_info_size;
104
  int tensor_block_size;
105
  int sub_prep_size;
106
  ccv_nnc_tensor_block_t* tensor_blocks;
107
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
108
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
109
  int* dup_tensor_block_ref;
110
  ccv_nnc_graph_visit_t* visit;
111
  ccv_nnc_tensor_alloc_prep_t* alloc_prep;
112
  struct ccv_nnc_symbolic_graph_prep_s* p;
113
  struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
114
  // Structures that don't require to be freed after deallocation.
115
  ccv_nnc_graph_t* graph; // materialized graph, not managed by prep.
116
  ccv_nnc_tensor_arena_t* tensor_arena; // tensor arena, not managed by prep as well.
117
} ccv_nnc_symbolic_graph_prep_t;
118
119
static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
120
29
{
121
29
  // Compute how many dis-continuous buffers are needed.
122
29
  // We prefer to have several dis-continuous buffers instead of one big buffer because
123
29
  // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
124
29
  // to fully utilize memory.
125
29
  int i, j, k;
126
29
  ccv_array_t** alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
127
29
  int allocable_tensor_size = 0, available_tensor_size = 0;
128
351
  for (i = 0; i < tensor_block_size; i++)
129
322
    
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
130
270
    {
131
270
      // Tensors that we need the header info.
132
270
      ++available_tensor_size;
133
270
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
134
270
        // Tensors that we actually need to allocate (exclude the alias).
135
216
        ++allocable_tensor_size;
136
270
    }
137
29
  ccv_sparse_matrix_t* tensor_itf = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_8U | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
138
29
  // Overlap count.
139
351
  for (i = 0; i < tensor_block_size; i++)
140
4.36k
    
    for (j = i + 1; j < tensor_block_size; j++)
141
4.04k
      
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
142
1.86k
      {
143
1.86k
        // If either of the tensors is const, they must interfere with each other.
144
1.86k
        const uint8_t one = 1;
145
1.86k
        if (TENSOR_EXPECT_CONST(tensor_blocks[i]) || TENSOR_EXPECT_CONST(tensor_blocks[j]))
146
0
          ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
147
1.86k
        else {
148
1.86k
          // Otherwise, check to see if they interfere (default to yes).
149
1.86k
          // If any of the i's head is deterministically later than j's tail
150
1.86k
          // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
151
1.86k
          int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
152
1.86k
          int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
153
1.86k
          // It cannot be that both i can hop to j and j can hop to i.
154
1.86k
          assert(!(i_hop_j > 0 && j_hop_i > 0));
155
1.86k
          if (!i_hop_j && !j_hop_i)
156
1.35k
            ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
157
1.86k
        }
158
1.86k
      }
159
29
  int* oc = (int*)cccalloc(tensor_block_size, sizeof(int));
160
351
  for (i = 0; i < tensor_block_size; i++)
161
8.73k
    
    for (j = 0; j < tensor_block_size; j++)
162
322
      // If these two tensors are still alive, analyze them.
163
8.41k
      
      if (i != j && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
164
3.73k
      {
165
3.73k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(i, j), ccv_max(i, j));
166
3.73k
        // If their lifetimes overlap, count how many other tensors this one overlaps with.
167
3.73k
        if (cell.u8 && cell.u8[0] == 1)
168
2.70k
          ++oc[i];
169
3.73k
      }
170
29
  int* assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
171
29
  uint64_t* allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
172
29
  uint64_t* allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
173
29
  int num_assigned = 0; 
174
29
  // I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
175
29
  // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
176
29
  // The first channel denotes the bytes available for allocation,
177
29
  // the second channel denotes the offset available for the allocation,
178
29
  ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
179
29
  ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
180
228
  for (j = 0; j < allocable_tensor_size;)
181
199
  {
182
199
    // Find the one with largest overlap, and it is not assigned.
183
199
    int max_oc = 0;
184
199
    ccv_array_clear(opt);
185
5.08k
    for (i = 0; i < tensor_block_size; i++)
186
4.88k
      
      if (oc[i] >= max_oc && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] && IS_PRIMARY_COMPANION(i, tensor_blocks[i]))
187
1.04k
      {
188
1.04k
        ccv_nnc_tensor_opt_t a = {
189
1.04k
          .size = tensor_blocks[i].size,
190
1.04k
          .index = i,
191
1.04k
          .companion = -1, // If already have a designated companion, use that.
192
1.04k
          .oc = oc[i],
193
1.04k
        };
194
1.04k
        if (tensor_blocks[i].companion_ref)
195
7
        {
196
7
          const int companion_ref = tensor_blocks[i].companion_ref - 1;
197
7
          a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
198
7
          a.oc += oc[companion_ref];
199
7
        }
200
1.04k
        // In case we have a tie, take them all in the array.
201
1.04k
        if (a.oc > max_oc)
202
390
          ccv_array_clear(opt), max_oc = a.oc;
203
1.04k
        ccv_array_push(opt, &a);
204
1.04k
      }
205
199
    assert(opt->rnum > 0);
206
199
    // Go through the opt array, find all tensors that don't interfere with it and have a larger tensor size.
207
199
    // Push them with the "companion" into the opt array as well.
208
199
    const int rnum = opt->rnum;
209
737
    for (i = 0; i < rnum; i++)
210
538
    {
211
538
      // Copy it out, because after insertion, it may hold invalid pointer.
212
538
      ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
213
538
      assert(a.companion == -1);
214
538
      const int companion_ref = tensor_blocks[i].companion_ref - 1;
215
13.7k
      for (k = 0; k < tensor_block_size; k++)
216
538
        // Find non-overlapping tensor that has larger size (of course, is unassigned and is not one with designated companion).
217
13.2k
        
        if (k != a.index && !tensor_blocks[k].companion_ref && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]) && !assigned[k] && tensor_blocks[k].size > a.size)
218
1.12k
        {
219
1.12k
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(a.index, k), ccv_max(a.index, k));
220
1.12k
          // Good, push to opt array.
221
1.12k
          if (cell.u8 && cell.u8[0] == 1)
222
766
            continue;
223
360
          
          if (companion_ref >= 0)
224
0
          {
225
0
            assert(companion_ref != k);
226
0
            // Have to make sure k doesn't interfere with the designated companion as well.
227
0
            ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(companion_ref, k), ccv_max(companion_ref, k));
228
0
            if (cell.u8 && cell.u8[0] == 1)
229
0
              continue;
230
0
          }
231
360
          ccv_nnc_tensor_opt_t b = a;
232
360
          b.companion = k;
233
360
          b.oc = a.oc + oc[k];
234
360
          b.size = tensor_blocks[k].size;
235
360
          ccv_array_push(opt, &b);
236
360
        }
237
538
    }
238
199
    // Order opt array by the size.
239
199
    _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
240
199
    // Assuming all tensors have the same data format (32F), we only need to consider the dimensional size.
241
199
    // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
242
199
    int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep->rows * 3;
243
199
    uint64_t min_val[2] = {
244
199
      0, 0
245
199
    };
246
865
    for (i = 0; i < opt->rnum; i++)
247
705
    {
248
705
      ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
249
705
      // Now, determine the order between a and c. After this, we can always check whether y
250
705
      // can hop to the earliest one and if the latest one can hop to x.
251
705
      // The earliest one will be called p and the latest one will be called q.
252
705
      int p = a.index;
253
705
      int q = a.index;
254
705
      if (a.companion >= 0)
255
219
      {
256
219
        const int a_hop_c = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a.companion], tensor_blocks[a.index]);
257
219
        const int c_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a.index], tensor_blocks[a.companion]);
258
219
        assert((a_hop_c > 0 && c_hop_a == 0) || (a_hop_c == 0 && c_hop_a > 0));
259
219
        if (a_hop_c > 0)
260
179
          q = a.companion;
261
219
        else
262
40
          p = a.companion;
263
219
      }
264
705
      if (tensor_blocks[a.index].companion_ref)
265
7
      {
266
7
        const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
267
7
        assert(a.companion != companion_ref);
268
7
        const int b_hop_p = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[p], tensor_blocks[companion_ref]);
269
7
        if (b_hop_p > 0)
270
1
          p = companion_ref;
271
6
        else {
272
6
          const int q_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[companion_ref], tensor_blocks[q]);
273
6
          if (q_hop_b > 0)
274
6
            q = companion_ref;
275
0
          else { // Otherwise, b is in between p and q.
276
0
            const int p_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[companion_ref], tensor_blocks[p]);
277
0
            const int b_hop_q = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[q], tensor_blocks[companion_ref]);
278
0
            assert(p_hop_b > 0 && b_hop_q > 0);
279
0
          }
280
6
        }
281
7
      }
282
17.4k
#define for_block(y, x, val) do { \
283
17.4k
        /* y is always earlier than x, but this is hard to assert now. */ \
284
17.4k
        /* If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds. */ \
285
17.4k
        /* Thus, the hop between y and x (through a) should be smallest ones. */ \
286
17.4k
        if (((uint64_t*)val)[0] >= a.size) \
287
7.89k
        { \
288
4.17k
          int y_hop_p = (y == 0) ? exec_dep->rows : _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[p], tensor_blocks[y - 1]); \
289
4.33k
          int q_hop_x = (x == tensor_block_size + 1) ? exec_dep->rows : _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[x - 1], tensor_blocks[q]); \
290
7.89k
          int hop = y_hop_p + q_hop_x; \
291
7.89k
          /* a.index doesn't overlap with y and x (in between) */ \
292
7.89k
          if ((y == 0 || y_hop_p) && (x == tensor_block_size + 1 || q_hop_x) && hop < min_hop) \
293
39
            min_y = y, min_x = x, min_hop = hop, \
294
39
            min_val[0] = ((uint64_t*)val)[0], min_val[1] = ((uint64_t*)val)[1]; \
295
7.89k
        } \
296
17.4k
      } while (0)
297
17.4k
      
      CCV_SPARSE_FOREACH(alloc, for_block);
298
705
#undef for_block
299
705
      // If I found a place, stop, and exit.
300
705
      
      if (min_y > 0 || min_x < tensor_block_size + 1)
301
39
      {
302
39
        min_i = i;
303
39
        break;
304
39
      }
305
705
    }
306
199
    // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
307
199
    // and default to largest size available.
308
199
    
    ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
309
199
    if (min_i == -1)
310
160
    {
311
160
      allocated_size[num_assigned] = a.size;
312
160
      ++num_assigned;
313
160
    }
314
199
    int assign_group = num_assigned;
315
199
    if (min_y > 0)
316
11
    {
317
11
      assign_group = assigned[min_y - 1];
318
11
      // The y and x should belong to the same assigned group.
319
11
      assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
320
188
    } else if (min_x < tensor_block_size + 1)
321
28
      assign_group = assigned[min_x - 1];
322
199
    // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
323
199
    if (min_y != 0 || min_x != tensor_block_size + 1)
324
39
    {
325
39
      uint64_t val[2] = {
326
39
        min_val[0], min_val[1]
327
39
      };
328
39
      assert(val[0] >= a.size);
329
39
      val[0] -= a.size;
330
39
      val[1] = val[1] + a.size; // Move the offset to the next one.
331
39
      ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
332
39
    }
333
199
    int strings[3];
334
199
    strings[0] = a.index + 1;
335
199
    int string_size = 1;
336
199
    // Assign out companion as well.
337
199
    if (a.companion >= 0)
338
10
    {
339
10
      const int a_hop_c = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a.companion], tensor_blocks[a.index]);
340
10
      if (a_hop_c > 0)
341
4
        strings[1] = a.companion + 1;
342
6
      else {
343
6
        strings[1] = strings[0];
344
6
        strings[0] = a.companion + 1;
345
6
      }
346
10
      ++string_size;
347
10
    }
348
199
    // Assign out the designated companion if it exists.
349
199
    if (tensor_blocks[a.index].companion_ref && a.companion != tensor_blocks[a.index].companion_ref - 1)
350
7
    {
351
7
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
352
7
      const int b_hop_p = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[strings[0] - 1], tensor_blocks[companion_ref]);
353
7
      if (b_hop_p > 0)
354
1
      {
355
2
        for (i = 0; i < string_size; i++)
356
1
          strings[i + 1] = strings[i];
357
1
        strings[0] = companion_ref + 1;
358
6
      } else {
359
6
        const int q_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[companion_ref], tensor_blocks[strings[string_size - 1] - 1]);
360
6
        if (q_hop_b > 0)
361
6
          strings[string_size] = companion_ref + 1;
362
0
        else {
363
0
          // Because b_hop_p is 0, q_hop_b is 0, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
364
0
          assert(string_size == 2);
365
0
          strings[2] = strings[1];
366
0
          strings[1] = companion_ref + 1;
367
0
        }
368
6
      }
369
7
      ++string_size;
370
7
    }
371
199
    // Assign out and update oc.
372
415
    for (i = 0; i < string_size; i++)
373
216
    {
374
216
      const int index = strings[i] - 1;
375
216
      // Assign out the selected one.
376
216
      assigned[index] = assign_group;
377
216
      // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
378
216
      allocated_offset[index] = min_val[1];
379
5.87k
      for (k = 0; k < tensor_block_size; k++)
380
5.65k
        
        if (!assigned[k] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]))
381
1.86k
        {
382
1.86k
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(k, index), ccv_max(k, index));
383
1.86k
          if (cell.u8 && cell.u8[0] == 1)
384
1.35k
            --oc[k];
385
1.86k
        }
386
216
    }
387
199
    uint64_t val[2] = {
388
199
      a.size, min_val[1]
389
199
    };
390
199
    uint64_t consumed_size = 0;
391
199
    // Go over from min_y to string_size (excluding min_x).
392
203
    for (i = 0; i < string_size; i++)
393
203
    {
394
203
      const uint64_t size = tensor_blocks[strings[i] - 1].size;
395
203
      assert(size <= a.size);
396
203
      // Update consumed size if it is bigger than "size".
397
203
      if (size > consumed_size)
398
203
      {
399
203
        val[0] = size - consumed_size;
400
203
        ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
401
203
        consumed_size = size;
402
203
        val[1] = min_val[1] + consumed_size;
403
203
      }
404
203
      // If it consumed all the flow, break out.
405
203
      if (consumed_size == a.size)
406
199
        break;
407
203
    }
408
415
    for (i = 0; i < string_size; i++)
409
216
    {
410
216
      const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
411
216
      uint64_t val[2] = {
412
216
        i_size, min_val[1]
413
216
      };
414
216
      uint64_t consumed_size = 0;
415
222
      for (k = i + 1; k < string_size; k++)
416
17
      {
417
17
        const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
418
17
        // Update consumed size if it is bigger than "size".
419
17
        if (size > consumed_size)
420
17
        {
421
17
          val[0] = size - consumed_size;
422
17
          ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
423
17
          consumed_size = size;
424
17
          val[1] = min_val[1] + consumed_size;
425
17
        }
426
17
        // If it consumed all the flow, break out.
427
17
        if (consumed_size == i_size)
428
11
          break;
429
17
      }
430
216
      val[0] = i_size - consumed_size;
431
216
      // Still have residual, flow it to min_x.
432
216
      if (val[0] > 0)
433
205
        ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
434
216
    }
435
199
    j += string_size;
436
199
  }
437
29
  ccv_array_free(opt);
438
29
  ccv_matrix_free(tensor_itf);
439
425
#define for_block(y, x, val) do { \
440
425
    if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
441
57
    { \
442
57
      if (!alloc_dep[x - 1]) \
443
40
        alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
444
57
      ccv_array_replace_int(alloc_dep[x - 1], y - 1, y - 1); \
445
57
    } \
446
425
  } while (0)
447
425
  
  CCV_SPARSE_FOREACH(alloc, for_block);
448
29
#undef for_block
449
29
  ccv_matrix_free(alloc);
450
29
  ccfree(oc);
451
29
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
452
29
  alloc_prep->alloc_dep = alloc_dep;
453
29
  alloc_prep->vt_block_size = tensor_block_size;
454
29
  alloc_prep->buffer_size = num_assigned;
455
29
  alloc_prep->block_size = available_tensor_size;
456
29
  alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
457
29
  alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
458
29
  alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
459
29
  memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
460
189
  for (i = 0; i < num_assigned; i++)
461
160
    alloc_prep->buffers[i].size = allocated_size[i];
462
29
  ccfree(allocated_size);
463
29
  j = 0;
464
29
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
465
351
  for (i = 0; i < tensor_block_size; i++)
466
322
    
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
467
270
    {
468
270
      alloc_prep->blocks[j].block_ref = i;
469
270
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
470
216
      {
471
216
        alloc_prep->vt_blocks[i] = j;
472
216
        // Also, set its allocations.
473
216
        assert(assigned[i] > 0);
474
216
        const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
475
216
        alloc_prep->blocks[j].offset = allocated_offset[i];
476
216
        if (!alloc_prep->buffers[buffer_ref].type)
477
160
          alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
478
216
        alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
479
216
        assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
480
54
      } else {
481
54
        alloc_prep->vt_blocks[i] = -1;
482
54
        alloc_prep->blocks[j].buffer_ref = -1;
483
54
        alloc_prep->blocks[j].offset = 0;
484
54
      }
485
270
      ++j;
486
270
    } else
487
52
      alloc_prep->vt_blocks[i] = -1;
488
29
  ccfree(allocated_offset);
489
29
  ccfree(assigned);
490
29
  return alloc_prep;
491
29
}
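_ccv_nnc_tensor_alloc_prep_new above does three things: it builds the interference matrix tensor_itf from pairwise lifetime checks, counts per-block overlaps in oc, and then greedily assigns blocks (largest size, highest overlap first) into shared buffers by threading them through the alloc flow graph, which also yields byte offsets. The toy program below illustrates only the high-level buffer-reuse idea under simplifying assumptions (no offsets inside a buffer, a hand-written interference matrix); it is not the algorithm above:

#include <stdio.h>
#include <stdint.h>

#define N 4

// Walk blocks from largest/most-contended to smallest and reuse a buffer
// whenever the block interferes with nothing already placed there. The real
// routine additionally packs blocks at byte offsets inside each buffer via the
// allocation graph (the min_y/min_x edge bookkeeping).
int main(void)
{
  uint64_t size[N] = { 128, 64, 64, 32 };   // tensor block sizes, already in assignment order for brevity
  int interfere[N][N] = {                   // 1 = lifetimes overlap
    { 0, 1, 0, 0 },
    { 1, 0, 1, 0 },
    { 0, 1, 0, 1 },
    { 0, 0, 1, 0 },
  };
  int assigned[N];
  uint64_t buffer_size[N];
  int buffer_count = 0;
  for (int i = 0; i < N; i++)
  {
    int buf = -1;
    for (int b = 0; b < buffer_count && buf < 0; b++)
    {
      int ok = 1;
      for (int j = 0; j < i; j++)
        if (assigned[j] == b && interfere[i][j])
          ok = 0;
      if (ok)
        buf = b;
    }
    if (buf < 0)
      buffer_size[buf = buffer_count++] = 0;
    assigned[i] = buf;
    if (size[i] > buffer_size[buf])
      buffer_size[buf] = size[i];
  }
  for (int i = 0; i < N; i++)
    printf("block %d -> buffer %d\n", i, assigned[i]);
  printf("%d buffers\n", buffer_count); // 2 buffers for this toy interference pattern
  return 0;
}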
492
493
static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
494
29
{
495
29
  int i;
496
351
  for (i = 0; i < alloc_prep->vt_block_size; i++)
497
322
    
    if (alloc_prep->alloc_dep[i])
498
40
      ccv_array_free(alloc_prep->alloc_dep[i]);
499
29
  ccfree(alloc_prep->alloc_dep);
500
29
  ccfree(alloc_prep);
501
29
}
502
503
// Simple allocator from ccv_array_t.
504
static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
505
251
{
506
251
  int pos = tensor_metadata->rnum;
507
251
  int rsize = (size + 15) / 16;
508
251
  ccv_array_resize(tensor_metadata, pos + rsize);
509
251
  return (pos << 1) + 1;
510
251
}
511
512
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
513
707
{
514
707
  assert((pos >> 1) <= tensor_metadata->rnum);
515
707
  return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
516
707
}
517
518
641
#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
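The metadata helpers use a tagged-index trick: _ccv_nnc_tensor_metadata_pos_new reserves 16-byte slots in the growable tensor_metadata array and returns (pos << 1) + 1, so a "position" can be stored in the same pointer-sized fields as a real ccv_nnc_tensor_t* and told apart by its low bit (real pointers are aligned, so their low bit is 0); _ccv_nnc_tensor_metadata_get shifts the bit back out. Positions rather than raw pointers are stored presumably because the array may be resized and relocated while tensors are still being created; _ccv_nnc_tensor_metadata_rewire below swaps them for real pointers once the metadata array is final. A self-contained sketch of the same encode/decode round trip, with malloc/realloc standing in for ccv_array_t:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

typedef struct { char* data; int rnum, cap; } meta_t; // stand-in for a ccv_array_t of 16-byte slots

static int pos_new(meta_t* m, size_t size)
{
  int pos = m->rnum;
  int rsize = (int)((size + 15) / 16);        // round the requested size up to 16-byte slots
  if (m->rnum + rsize > m->cap)
  {
    m->cap = (m->rnum + rsize) * 2;
    m->data = realloc(m->data, (size_t)m->cap * 16);
  }
  m->rnum += rsize;
  return (pos << 1) | 1;                      // tag with the low bit so it cannot be mistaken for a pointer
}

static void* pos_get(const meta_t* m, int pos)
{
  assert(pos & 1);                            // like CCV_NNC_IS_METADATA_POS
  return m->data + (size_t)(pos >> 1) * 16;   // untag and index into the slot array
}

int main(void)
{
  meta_t m = { 0, 0, 0 };
  int pos = pos_new(&m, 24);                  // needs two 16-byte slots
  strcpy(pos_get(&m, pos), "tensor header");
  int pos2 = pos_new(&m, 8);
  assert((pos2 >> 1) == 2 && strcmp(pos_get(&m, pos), "tensor header") == 0);
  free(m.data);
  return 0;
}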
519
520
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
521
559
{
522
559
  // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
523
559
  if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
524
233
    return vt_tensor;
525
326
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
526
326
  if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref))
527
8
    tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
528
326
  if (CCV_IS_TENSOR_MULTIVIEW(tensor))
529
21
  {
530
21
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
531
21
    int i;
532
21
    const int count = mv->kind + mv->repeat;
533
67
    for (i = 0; i < count; i++)
534
46
      
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_rewire(tensor_metadata, CCV_NNC_MULTIVIEW_DATA(mv)[i]);
535
21
    // No need to recursively do parent pointer, otherwise we are in deep rewire.
536
21
    if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p))
537
7
      mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
538
21
    if (mv->rtvs)
539
16
      
      for (i = 0; i < mv->rtvs->rnum; i++)
540
9
      {
541
9
        ccv_nnc_tensor_t** tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->rtvs, i);
542
9
        *tensor = _ccv_nnc_tensor_metadata_rewire(tensor_metadata, *tensor);
543
9
        assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
544
9
      }
545
21
  }
546
326
  return tensor;
547
559
}
548
549
static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, int* const tensor_block_pos, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
550
32
{
551
32
  // If already have pos assigned, return.
552
32
  if (tensor_block_pos[block_ref])
553
18
    return tensor_block_pos[block_ref];
554
14
  int i;
555
14
  const int nth_unroll = prep->nth_unroll;
556
14
  int vt_ref = prep->alloc_prep->vt_blocks[block_ref];
557
14
  assert(block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
558
14
  const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
559
14
  uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
560
14
  int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
561
14
  for (i = idx - 1; i >= 0; i--)
562
14
  {
563
14
    assert(p_ref >= 0);
564
14
    const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
565
14
    if (ch[i]) // Prefer the dup side of things.
566
0
      p_ref = graph_prep->dup_tensor_block_ref[p_ref * nth_unroll + ch[i] - 1];
567
14
    vt_ref = graph_prep->alloc_prep->vt_blocks[p_ref];
568
14
    const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
569
14
    offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
570
14
    // If the buffer already exists, prefer that.
571
14
    const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
572
14
    if (ptr)
573
14
    {
574
14
      // If I have any remaining path that is not covered from 0, I cannot possibly
575
14
      // have any pointer from buffer (that can only happen if it is not dup).
576
16
      for (--i; i >= 0; i--)
577
2
        
        if (ch[i] != 0)
578
0
          return 0;
579
14
      const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
580
14
      ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
581
14
      *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr + offset, params, 0);
582
14
      tensor_block_pos[block_ref] = tv_pos;
583
14
      if (prep->tensor_blocks[block_ref].companion_ref)
584
6
        tensor_block_pos[prep->tensor_blocks[block_ref].companion_ref - 1] = tv_pos;
585
14
      return tv_pos;
586
14
    }
587
0
    p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
588
0
  }
589
0
  return 0;
590
14
}
591
592
// Descend from the root to the prep level, and compose the multiview from there.
593
static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, int* const tensor_block_pos, const ccv_nnc_tensor_param_t params, const int preserve, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
594
28
{
595
28
  assert(pos_ref);
596
28
  int i;
597
28
  const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
598
28
  const int nth_unroll = prep->nth_unroll;
599
28
  if (prep == graph_prep)
600
14
  {
601
14
    const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, tensor_block_pos, params, preps, block_ref, ch, idx, prep);
602
14
    if (!data_pos)
603
0
      return -1;
604
14
    // Based on ch, go all the way back to find the exact pointer to compose.
605
14
    
if (14
prep->dup_tensor_block_ref &&14
606
14
      prep->dup_tensor_block_ref[block_ref * nth_unroll] >= 0 &&
607
14
      prep->dup_tensor_block_ref[block_ref * nth_unroll] != block_ref)
608
14
    {
609
14
      int pos[nth_unroll + 1];
610
14
      pos[0] = data_pos;
611
32
      for (i = 0; i < nth_unroll; i++)
612
18
        pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, tensor_block_pos, params, preps, prep->dup_tensor_block_ref[block_ref * nth_unroll + i], ch, idx, prep);
613
14
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
614
14
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
615
14
      ccv_nnc_tensor_t* data[nth_unroll + 1];
616
46
      for (i = 0; i < nth_unroll + 1; i++)
617
32
        data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
618
14
      ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, nth_unroll + 1, prep->graph, mv);
619
46
      for (i = 0; i < nth_unroll + 1; i++)
620
32
        
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
621
14
      *pos_ref = mv_pos;
622
0
    } else {
623
0
      *pos_ref = data_pos;
624
0
    }
625
14
    if (preserve)
626
3
    {
627
3
      // If need to preserve, this need to be more complicated. At loop 0, I need to access the new assigned tv.
628
3
      // at any other loops, it should be the same. Thus, for this case, I will create a mv tensor as following:
629
3
      // mv of K11, thus, when loop is 0, it unwrap to mv->data[0], otherwise, unwrap to mv->data[1].
630
3
      // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
631
3
      // arena allocated).
632
3
      // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
633
3
      // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
634
3
      // it to a K01 structure.
635
3
      // Why we didn't wrap it directly as mv->data[0] pointing to a assigned tv pointer and the mv->data[1] pointing
636
3
      // to the raw pointer (as ptr_ref) with K11? The reason is we don't know the assigned tv is pointing to one
637
3
      // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
638
3
      int prev_mv_pos = *pos_ref;
639
3
      if (prev_mv_pos == -1)
640
0
      {
641
0
        prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
642
0
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
643
0
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
644
0
        ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
645
0
          tv,
646
0
        }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
647
0
        CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
648
0
      }
649
3
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
650
3
      ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
651
3
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
652
3
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
653
3
        CCV_NNC_TENSOR_PLACEHOLDER,
654
3
        (ccv_nnc_tensor_t*)prev_mv,
655
3
      }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
656
3
      prev_mv->p = (void*)(intptr_t)mv_pos;
657
3
      
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
658
3
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
659
3
      *pos_ref = mv_pos;
660
3
    }
661
14
    return 0;
662
14
  }
663
14
  ch[idx] = 0;
664
14
  int pos[nth_unroll + 1];
665
14
  pos[0] = 0;
666
14
  const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, tensor_block_pos, params, preserve, preps, graph_prep, block_ref, ch, idx + 1, pos);
667
14
  assert(retval == 0);
668
16
  for (i = 0; i < nth_unroll; i++)
669
2
  {
670
2
    ch[idx] = i + 1;
671
2
    pos[i + 1] = 0;
672
2
    const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, tensor_block_pos, params, preserve, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
673
2
    if (dup_retval < 0)
674
0
    {
675
0
      assert(i == 0);
676
0
      break;
677
0
    }
678
2
  }
679
14
  // If current prep has no dup.
680
14
  if (i == 0)
681
12
  {
682
12
    *pos_ref = pos[0];
683
12
    return 0;
684
12
  }
685
2
  ccv_nnc_tensor_t* data[nth_unroll + 1];
686
2
  // Compose to a new multiview.
687
6
  for (i = 0; i < nth_unroll + 1; i++)
688
4
    { assert(pos[i] > 0); }
689
2
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
690
6
  for (i = 0; i < nth_unroll + 1; i++)
691
4
    data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
692
2
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
693
2
  ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, nth_unroll + 1, prep->graph, mv);
694
6
  for (i = 0; i < nth_unroll + 1; i++)
695
4
    
    if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
696
4
      ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
697
6
  for (i = 0; i < nth_unroll + 1; i++)
698
4
    
    CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
699
2
  *pos_ref = mv_pos;
700
2
  return 0;
701
14
}
702
703
static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
704
43
{
705
43
  int i;
706
43
  int is_input = 0;
707
95
  for (i = 0; i < node->input_size && !is_input; i++)
708
52
    
    if (p_ref == node->inputs[i])
709
33
      is_input = 1;
710
43
  int is_output = 0;
711
75
  for (i = 0; i < node->output_size && !is_output; i++)
712
32
    
    if (p_ref == node->outputs[i])
713
10
      is_output = 1;
714
43
  assert(!(is_input && is_output));
715
43
  if (is_input)
716
33
    return -1;
717
10
  
  if (is_output)
718
10
    return 1;
719
0
  return 0;
720
10
}
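_ccv_nnc_is_symbolic_graph_exec_input_or_output reports where p_ref appears on the node: -1 for an input, 1 for an output, 0 for neither, and the assert forbids it being both at once. A small sketch of the same convention with a hypothetical minimal node struct (not the real ccv_nnc_graph_exec_symbol_info_t):

#include <assert.h>
#include <stdio.h>

typedef struct { const int* inputs; int input_size; const int* outputs; int output_size; } node_t; // hypothetical stand-in

// -1: p_ref is an input, 1: p_ref is an output, 0: neither.
static int input_or_output(int p_ref, const node_t* node)
{
  int is_input = 0, is_output = 0;
  for (int i = 0; i < node->input_size && !is_input; i++)
    if (p_ref == node->inputs[i])
      is_input = 1;
  for (int i = 0; i < node->output_size && !is_output; i++)
    if (p_ref == node->outputs[i])
      is_output = 1;
  assert(!(is_input && is_output));
  return is_input ? -1 : (is_output ? 1 : 0);
}

int main(void)
{
  const int inputs[] = { 3, 4 }, outputs[] = { 7 };
  node_t node = { inputs, 2, outputs, 1 };
  printf("%d %d %d\n", input_or_output(4, &node), input_or_output(7, &node), input_or_output(9, &node)); // -1 1 0
  return 0;
}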
721
722
static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
723
18
{
724
18
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
725
18
  // If p is not input, no need to preserve at all.
726
18
  if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
727
2
    return 0;
728
16
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
729
16
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
730
16
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
731
16
  /* This needs detailed explanation, what does preserve mean?
732
16
   * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
733
16
   * also used outside of the while loop, we cannot reuse the memory region of x for
734
16
   * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
735
16
   * y uses the same memory region as x). The way to workaround this is by using a different
736
16
   * memory region for y = x + 1, but for the first iteration, having x pointing to the
737
16
   * original. During the allocation process, the way to identify whether x should preserve
738
16
   * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
739
16
   * parent tensor is the same as the memory region it plans to use in the buffer, then we are
740
16
   * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
741
16
   * it is the input tensor whenever that is possible. A tensor block can point to two parent
742
16
   * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
743
16
   * tensor whenever that is possible. */
744
16
  if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
745
10
    return 0;
746
16
  // Otherwise, return 1 because we now need to preserve.
747
6
  return 1;
748
16
}
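To make the preserve rule above concrete: if y were handed x's memory region inside a while loop whose surroundings still need x, the first y = x + 1 would clobber the caller's x. A toy demonstration with plain floats standing in for tensor buffers (illustrative only, not the arena machinery):

#include <stdio.h>

int main(void)
{
  float x = 1.f;          // tensor the outer graph still needs after the loop
  float y_buf = 0.f;      // separate region assigned to y (the "preserve" case)
  const float* cur = &x;  // the first iteration reads the original x
  for (int i = 0; i < 3; i++)
  {
    y_buf = *cur + 1.f;   // y = x + 1
    cur = &y_buf;         // later iterations read the carried-over value (y => x)
  }
  // Had y shared x's region, x would now read 4 instead of the preserved 1.
  printf("x after loop: %g, loop result: %g\n", x, y_buf); // x after loop: 1, loop result: 4
  return 0;
}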
749
750
static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
751
11
{
752
11
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
753
11
  int i;
754
34
  for (i = 0; i < mv->kind + mv->repeat; i++)
755
23
    
    if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
756
5
      
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
757
18
    else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
758
5
      
      _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
759
11
}
760
761
static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
762
11
{
763
11
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
764
11
  int i;
765
34
  for (i = 0; i < mv->kind + mv->repeat; i++)
766
23
  {
767
23
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
768
5
      
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
769
23
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
770
5
      
      _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
771
23
  }
772
11
}
773
774
static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, int* const tensor_block_pos, const int preserve, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
775
12
{
776
12
  // Go to the root of the graph.
777
12
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
778
12
  int i;
779
26
  for (i = 1; prep->p; i++)
780
14
    prep = prep->p;
781
12
  // Root graph should have no dup tensor blocks.
782
12
  assert(!prep->dup_tensor_block_ref);
783
12
  const int c = i;
784
12
  const ccv_nnc_symbolic_graph_prep_t* preps[c];
785
12
  prep = graph_prep;
786
12
  preps[c - 1] = prep;
787
26
  for (i = 0; prep->p; i++)
788
14
    preps[c - 2 - i] = prep = prep->p;
789
12
  int ch[c]; // Use dynamic allocation for the array. This array records our selections when recursing from top to bottom.
790
12
  memset(ch, 0, sizeof(int) * c);
791
12
  int pos = 0;
792
12
  _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, tensor_block_pos, params, preserve, preps, graph_prep, block_ref, ch, 0, &pos);
793
12
  assert(ch[c - 1] == 0); // This should never be modified.
794
12
  assert(pos > 0);
795
12
  return pos;
796
12
}
797
798
static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, int* const tensor_block_pos, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
799
2
{
800
2
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
801
2
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
802
2
  ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
803
2
  ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
804
2
    CCV_NNC_TENSOR_PLACEHOLDER,
805
2
    tv,
806
2
  }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
807
2
  
  CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
808
2
  CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
809
2
  return mv_pos;
810
2
}
811
812
static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
813
29
{
814
29
  // All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
815
29
  // Each tensor has its designation in the assigned array, and its offset in allocated_offset.
816
29
  const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
817
29
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
818
29
  const int tensor_block_size = graph_prep->tensor_block_size;
819
29
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
820
29
  const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
821
29
  const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
822
19
  const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
823
29
  const int nth_unroll = graph_prep->nth_unroll;
824
29
  int i, j;
825
29
  int m_tensor_size = 0;
826
309
  for (i = 0; i < tensor_symbol_info_size; i++)
827
29
    // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
828
280
    
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
829
228
      ++m_tensor_size;
830
29
  ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(ccv_nnc_tensor_t*) * m_tensor_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size);
831
29
  graph_prep->tensor_arena = tensor_arena;
832
29
  tensor_arena->graph_ref = graph_prep->graph_ref;
833
29
  tensor_arena->buffers = (void*)(tensor_arena + 1);
834
29
  tensor_arena->buffer_size = alloc_prep->buffer_size;
835
29
  tensor_arena->vt_tensor_size = tensor_symbol_info_size;
836
29
  tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
837
29
  tensor_arena->m_tensor_size = m_tensor_size;
838
29
  tensor_arena->m_tensors = (ccv_nnc_tensor_t**)(tensor_arena->vt_tensors + tensor_symbol_info_size);
839
29
  tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->m_tensors + m_tensor_size);
840
29
  tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
841
29
  tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
842
189
  for (i = 0; i < alloc_prep->buffer_size; i++)
843
160
    tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
844
29
  int memory_type = CCV_TENSOR_GET_MEMORY(alloc_prep->buffers[0].type);
845
29
  int device_id = CCV_TENSOR_GET_DEVICE_ID(alloc_prep->buffers[0].type);
846
160
  for (i = 1; i < alloc_prep->buffer_size; i++)
847
131
  {
848
131
    assert(CCV_TENSOR_GET_MEMORY(alloc_prep->buffers[i].type) == memory_type);
849
131
    assert(CCV_TENSOR_GET_DEVICE_ID(alloc_prep->buffers[i].type) == device_id);
850
131
  }
851
351
  for (i = 0; i < graph_prep->tensor_block_size; i++)
852
322
  {
853
322
    assert(CCV_TENSOR_GET_MEMORY(tensor_blocks[i].type) == memory_type);
854
322
    assert(CCV_TENSOR_GET_DEVICE_ID(tensor_blocks[i].type) == device_id);
855
322
  }
856
29
  tensor_arena->memory_type = memory_type;
857
29
  tensor_arena->device_id = device_id;
858
29
  assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
859
29
  if (p_arena && p_graph_prep)
860
10
  {
861
10
    // Don't need to allocate the actual buffer, just use the pointer from the above.
862
10
    PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
863
46
    for (i = 0; i < tensor_arena->buffer_size; i++)
864
36
    {
865
36
      const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
866
36
      assert(p_ref >= 0);
867
36
      const int p_nth_unroll = p_graph_prep->nth_unroll;
868
36
      if (p_graph_prep->dup_tensor_block_ref &&
869
4
        p_graph_prep->dup_tensor_block_ref[p_ref * p_nth_unroll] >= 0 &&
870
4
        p_graph_prep->dup_tensor_block_ref[p_ref * p_nth_unroll] != p_ref)
871
2
      {
872
2
        // This condition means in the parent graph, we point to multiple tensor blocks for the same
873
2
        // buffer, therefore, we cannot have one single pointer assigned in this case.
874
2
        // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
875
2
        tensor_arena->buffers[i].ptr = 0;
876
2
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
877
2
        continue;
878
2
      }
879
36
      // Otherwise, find the actual buffer pointer.
880
34
      const int vt_ref = p_alloc_prep->vt_blocks[p_ref];
881
34
      assert(vt_ref >= 0);
882
34
      const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
883
34
      if (!p_arena->buffers[buffer_ref].ptr)
884
0
      {
885
0
        // Pass it down as 0 ptr.
886
0
        tensor_arena->buffers[i].ptr = 0;
887
0
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
888
0
        continue;
889
0
      }
890
34
      const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
891
34
      tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
892
34
      PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
893
34
    }
894
19
  } else {
895
19
    // Now, allocate actual buffers.
896
19
    PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
897
19
#ifdef HAVE_CUDA
898
19
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
899
0
    {
900
0
      for (i = 0; i < tensor_arena->buffer_size; i++)
901
0
      {
902
0
        tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
903
0
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
904
0
      }
905
19
    } else {
906
19
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
907
143
      for (i = 0; i < tensor_arena->buffer_size; i++)
908
124
      {
909
124
        ccmemalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
910
124
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
911
124
      }
912
19
    }
913
19
#else
914
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
915
    for (i = 0; i < tensor_arena->buffer_size; i++)
916
    {
917
      ccmemalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
918
      PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
919
    }
920
#endif
921
19
  }
922
29
  // Go over sub_preps and allocate arenas for them. Do it this early because
923
29
  // we may reference tensors from sub arenas, the reason why we need to reference
924
29
  // tensors from sub arenas is because for output tensors, sub arena's tensor
925
29
  // will have automatic reference updates.
926
40
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
927
29
    // TODO: I also need to pass bound tensors properly to the lower level.
928
11
    
if (11
graph_prep->sub_preps[i]11
)
929
10
      tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], tensor_arena, 0, 0);
930
11
    else
931
1
      tensor_arena->sub_arenas[i] = 0;
932
29
  memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
933
29
  int* const tensor_block_pos = tensor_block_size > 0 ? (int*)cccalloc(tensor_block_size, sizeof(int)) : 0;
934
29
  // Now that sub-arenas are all assigned, go over their outputs to assign out tensors from their outputs directly.
935
19
  ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
936
40
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
937
11
    
if (11
tensor_arena->sub_arenas[i]11
)
938
10
    {
939
10
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
940
10
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
941
16
      for (j = 0; j < node->output_size; j++)
942
6
      {
943
6
        const int idx = node->outputs[j];
944
6
        const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
945
6
        assert(s_idx >= 0);
946
6
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
947
6
        assert(sub_arena_out_tensors[idx] == 0);
948
6
        // Only assign if it is a multiview tensor.
949
6
        if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor))
950
2
          sub_arena_out_tensors[idx] = sub_tensor;
951
6
      }
952
10
    }
953
29
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
954
299
  for (i = 0; i < alloc_prep->block_size; i++)
955
270
    
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
956
228
    {
957
228
      const int block_ref = alloc_prep->blocks[i].block_ref;
958
228
      const int buffer_ref = alloc_prep->blocks[i].buffer_ref;
959
228
      const uint64_t offset = alloc_prep->blocks[i].offset;
960
228
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
961
228
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
962
183
      {
963
183
        // Either we have dup_tensor_block_ref in current layer, or we have that in
964
183
        // previous layer, therefore, cannot really find the buffer ptr.
965
183
        if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
966
181
          ((graph_prep->dup_tensor_block_ref &&
967
20
            graph_prep->dup_tensor_block_ref[block_ref * nth_unroll] >= 0 &&
968
20
            graph_prep->dup_tensor_block_ref[block_ref * nth_unroll] != block_ref) ||
969
169
           !tensor_arena->buffers[buffer_ref].ptr))
970
12
        {
971
12
          assert(graph_prep->p); // This must be in a sub-graph.
972
12
          // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
973
12
          if (graph_prep->tensor_blocks[block_ref].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
974
2
            continue;
975
10
          const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, tensor_block_pos, 0, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
976
10
          tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
977
171
        } else {
978
171
          // If already created, use the same tensor, and continue.
979
171
          if (tensor_blocks[block_ref].companion_ref &&
980
2
            tensor_block_pos[tensor_blocks[block_ref].companion_ref - 1])
981
1
          {
982
1
            tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)tensor_block_pos[tensor_blocks[block_ref].companion_ref - 1];
983
1
            continue;
984
1
          }
985
171
          // Having ptr.
986
170
          const int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
987
170
          tensor_block_pos[block_ref] = pos;
988
170
          if (tensor_blocks[block_ref].companion_ref)
989
1
            tensor_block_pos[tensor_blocks[block_ref].companion_ref - 1] = pos;
990
170
          tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
991
170
          ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
992
170
          // Also, set its allocations.
993
170
          // Since tensor view is bit compatible with tensor, we can just cast.
994
170
          *tensor = ccv_nnc_tensor(tensor_arena->buffers[buffer_ref].ptr + offset, tensor_symbol_info[block_ref].info, 0);
995
170
          assert(offset + tensor_blocks[block_ref].size <= tensor_arena->buffers[buffer_ref].size);
996
170
        }
997
183
      }
998
228
    }
999
29
  // Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1000
309
  for (i = 0; i < tensor_symbol_info_size; i++)
1001
29
    // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1002
280
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_blocks[i].ref)
1003
48
    {
1004
48
      int ref = tensor_blocks[i].ref - 1;
1005
49
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
1006
1
        ref = tensor_blocks[ref].ref - 1;
1007
48
      assert(tensor_arena->vt_tensors[ref]);
1008
48
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1009
48
    }
1010
29
  // Now after refs assigned out, handle the case I need to preserve because I am a sub graph.
1011
29
  if (graph_prep->p)
1012
10
  {
1013
10
    const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1014
10
    const int p_idx = graph_prep->p_idx - 1;
1015
22
    for (i = 0; i < node->input_size; i++)
1016
12
    {
1017
12
      const int idx = node->inputs[i];
1018
12
      const int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1019
12
      if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1020
8
        continue;
1021
4
      const int vt_ref = alloc_prep->vt_blocks[block_ref];
1022
4
      const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1023
4
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1024
4
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1025
4
      {
1026
4
        // Either we have dup_tensor_block_ref in current layer, or we have that in
1027
4
        // previous layer, therefore, cannot really find the buffer ptr.
1028
4
        if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1029
4
          ((graph_prep->dup_tensor_block_ref &&
1030
2
            graph_prep->dup_tensor_block_ref[block_ref * nth_unroll] >= 0 &&
1031
2
            graph_prep->dup_tensor_block_ref[block_ref * nth_unroll] != block_ref) ||
1032
2
           !tensor_arena->buffers[buffer_ref].ptr))
1033
2
        {
1034
2
          // We haven't allocated anything for this yet.
1035
2
          assert(tensor_arena->vt_tensors[block_ref] == 0);
1036
2
          const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, tensor_block_pos, 1, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1037
2
          tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1038
2
        } else {
1039
2
          const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_block_pos, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1040
2
          tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1041
2
        }
1042
4
      }
1043
4
    }
1044
10
  }
1045
29
  // Now it is time to handle alias.
1046
299
  for (i = 0; i < alloc_prep->block_size; i++)
1047
270
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1048
228
    {
1049
228
      const int block_ref = alloc_prep->blocks[i].block_ref;
1050
228
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1051
45
      {
1052
45
        // Assigning out the tensor aliases.
1053
45
        assert(tensor_symbol_info[block_ref].alias_ref);
1054
45
        const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1055
45
        // What it references is not an alias.
1056
45
        assert(tensor_arena->vt_tensors[alias_ref]);
1057
45
        const int alias_pos = (int)(intptr_t)tensor_arena->vt_tensors[alias_ref];
1058
45
        const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1059
45
        assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1060
45
        // Will use that to determine whether insert reference or not.
1061
45
        const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1062
53
        while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1063
8
        {
1064
8
          const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1065
8
          alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1066
8
        }
1067
45
        const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1068
45
        // If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1069
45
        if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1070
33
          memcmp(tensor_symbol_info[block_ref].inc, tensor_symbol_info[block_ref].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
1071
10
        {
1072
10
          const int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1073
10
          ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1074
10
          *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1075
10
          tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1076
10
          if (is_multiview)
1077
6
          {
1078
6
            ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1079
6
            tensor->alias_ref = (uintptr_t)alias_pos;
1080
6
            ccv_nnc_tensor_reference_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1081
6
          }
1082
35
        } else {
1083
35
          const int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1084
35
          ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1085
35
          // Otherwise initialize a tensor view
1086
35
          // 1). Simple case, if the inc is equal to original tensor, just init a tensor view.
1087
35
          if (memcmp(alias_tensor.info.dim, tensor_symbol_info[block_ref].inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
1088
33
            *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].info.dim);
1089
2
          else {
1090
2
            // Otherwise, create the tensor first, and then create the tensor view off the new tensor.
1091
2
            ccv_nnc_tensor_param_t info = tensor_symbol_info[block_ref].info;
1092
2
            memcpy(info.dim, tensor_symbol_info[block_ref].inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC);
1093
2
            assert(ccv_nnc_tensor_count(info) <= ccv_nnc_tensor_count(alias_tensor.info));
1094
2
            ccv_nnc_tensor_t tensor = ccv_nnc_tensor(alias_tensor.data.u8, info, 0);
1095
2
            *tensor_view = ccv_nnc_tensor_view(&tensor, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].info.dim);
1096
2
          }
1097
35
          tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1098
35
          if (is_multiview)
1099
2
          {
1100
2
            ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1101
2
            tensor_view->alias_ref = (uintptr_t)alias_pos;
1102
2
            ccv_nnc_tensor_reference_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1103
2
          }
1104
35
        }
1105
45
      }
1106
228
    }
1107
29
  // Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1108
40
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1109
11
    if (tensor_arena->sub_arenas[i])
1110
10
    {
1111
10
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1112
10
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1113
22
      for (j = 0; j < node->input_size; j++)
1114
12
      {
1115
12
        const int idx = node->inputs[j];
1116
12
        const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1117
12
        assert(s_idx >= 0);
1118
12
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1119
12
        // Only do the replacement if it is a multi-view tensor.
1120
12
        if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1121
6
        {
1122
6
          const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1123
6
          // If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1124
6
          // to this tensor.
1125
6
          if (CCV_IS_TENSOR_MULTIVIEW((ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)))
1126
1
          {
1127
1
            const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1128
1
            ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1129
1
            ccv_nnc_tensor_multiview_t* multiview = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1130
1
            ref_tensor->alias_ref = (uintptr_t)vt_pos;
1131
1
            ccv_nnc_tensor_reference_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1132
1
            ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)[0]);
1133
1
            while (CCV_IS_TENSOR_MULTIVIEW(tv))
1134
0
              tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]);
1135
1
            *ref_tensor = ccv_nnc_tensor(tv->data.ptr, tv->info, 0);
1136
1
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1137
1
          } else
1138
5
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1139
6
        }
1140
12
      }
1141
10
    }
1142
29
  // Everything is done, rewire vt_tensor to real locations. From now on, no push to tensor_metadata is possible.
1143
309
  for (i = 0, j = 0; i < tensor_symbol_info_size; i++)
1144
280
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_arena->vt_tensors[i])
1145
228
      tensor_arena->m_tensors[j++] = tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1146
29
  assert(j == m_tensor_size);
1147
29
  // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1148
309
  for (i = 0; i < tensor_symbol_info_size; i++)
1149
280
    if (tensor_arena->vt_tensors[i])
1150
276
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1151
29
  // Associate multiview tensors from sub arena to the parent.
1152
29
  if (sub_arena_out_tensors)
1153
67
    for (i = 0; i < alloc_prep->block_size; i++)
1154
57
      if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1155
28
      {
1156
28
        const int block_ref = alloc_prep->blocks[i].block_ref;
1157
28
        if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1158
0
          continue;
1159
28
        int sub_arena_ref = block_ref;
1160
28
        if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1161
5
        {
1162
5
          // Assigning out the tensor aliases.
1163
5
          assert(tensor_symbol_info[block_ref].alias_ref);
1164
5
          const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1165
5
          // What it references is not an alias.
1166
5
          assert(tensor_arena->vt_tensors[alias_ref]);
1167
5
          sub_arena_ref = alias_ref;
1168
5
          if (!sub_arena_out_tensors[sub_arena_ref])
1169
3
            continue;
1170
5
        }
1171
25
        if (!sub_arena_out_tensors[sub_arena_ref])
1172
21
          continue;
1173
4
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)sub_arena_out_tensors[sub_arena_ref];
1174
4
        assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1175
4
        tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1176
4
        ccv_nnc_tensor_reference_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1177
4
      }
1178
29
  if (sub_arena_out_tensors)
1179
10
    ccfree(sub_arena_out_tensors);
1180
29
  if (tensor_block_pos)
1181
29
    ccfree(tensor_block_pos);
1182
29
  // Handle binded tensors.
1183
31
  for (i = 0; i < tensor_bind_size; i++)
1184
2
  {
1185
2
    if (!tensor_binds[i].tensor) // If there is no tensor binded, it is a constant, we allocated in arena.
1186
0
      continue;
1187
2
    // For binded tensors, it shouldn't be assigned yet.
1188
2
    assert(tensor_arena->vt_tensors[tensor_binds[i].symbol.d] == 0);
1189
2
    // I have to cast this, unfortunately.
1190
2
    tensor_arena->vt_tensors[tensor_binds[i].symbol.d] = (ccv_nnc_tensor_t*)tensor_binds[i].tensor;
1191
2
  }
1192
29
  // Rewire sub arena's tensor references.
1193
40
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1194
11
    if (tensor_arena->sub_arenas[i])
1195
10
    {
1196
10
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1197
10
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1198
22
      for (j = 0; j < node->input_size; j++)
1199
12
      {
1200
12
        const int idx = node->inputs[j];
1201
12
        const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1202
12
        assert(s_idx >= 0);
1203
12
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1204
12
        // Only do the replacement if it is a multi-view tensor.
1205
12
        if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor))
1206
6
        {
1207
6
          // This is binded tensor, bind it now.
1208
6
          if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1209
0
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1210
6
          else
1211
6
            _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1212
6
        }
1213
12
      }
1214
10
    }
1215
29
  return tensor_arena;
1216
29
}
1217
1218
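The arena construction above leans on one trick worth calling out: while tensor_metadata can still grow (and therefore relocate), vt_tensors temporarily stores small integer positions cast through intptr_t, and only the final rewire pass turns them into stable pointers. Below is a minimal standalone sketch of that pattern; the names (item_t, pool, slots) are illustrative and not part of the library.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch: keep 1-based indices into a growable buffer while building,
 * then "rewire" them into real pointers once the buffer stops growing. */
typedef struct { float value; } item_t;

int main(void)
{
  int capacity = 2, count = 0, i;
  item_t* pool = (item_t*)malloc(sizeof(item_t) * capacity);
  void* slots[4]; /* During construction, holds (index + 1) disguised as a pointer. */
  for (i = 0; i < 4; i++)
  {
    if (count == capacity)
      pool = (item_t*)realloc(pool, sizeof(item_t) * (capacity *= 2)); /* May move the buffer. */
    pool[count].value = i * 1.5f;
    slots[i] = (void*)(intptr_t)(count + 1); /* A raw pointer here could dangle after realloc. */
    ++count;
  }
  for (i = 0; i < 4; i++) /* Rewire: the pool is final, indices can become pointers. */
    slots[i] = pool + ((intptr_t)slots[i] - 1);
  for (i = 0; i < 4; i++)
    printf("%f\n", ((item_t*)slots[i])->value);
  free(pool);
  return 0;
}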
static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1219
613
{
1220
613
  int i, found = 0;
1221
613
  // Try to insert head.
1222
613
  ccv_array_t* head = tensor_blocks.head;
1223
642
  for (i = 0; i < head->rnum;)
1224
360
  {
1225
360
    const int head_idx = *(int*)ccv_array_get(head, i);
1226
360
    if (head_idx == idx)
1227
54
    {
1228
54
      found = 1;
1229
54
      break;
1230
54
    }
1231
306
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1232
306
    if (cell.i32 && cell.i32[0] > 0)
1233
2
    {
1234
2
      /* If the current node is the parent of the head node, check if we found it or not. */
1235
2
      /* If not found, replace the current one. */
1236
2
      if (!found)
1237
2
      {
1238
2
        found = 1;
1239
2
        *(int*)ccv_array_get(head, i) = idx;
1240
0
      } else {
1241
0
        /* Remove the current one, change the rnum. */
1242
0
        if (i < head->rnum - 1)
1243
0
          *(int*)ccv_array_get(head, i) = *(int*)ccv_array_get(head, head->rnum - 1);
1244
0
        --head->rnum;
1245
0
        continue;
1246
0
      }
1247
304
    } else {
1248
304
      // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1249
304
      cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1250
304
      if (cell.i32 && cell.i32[0] > 0)
1251
277
      {
1252
277
        found = 1;
1253
277
        break;
1254
277
      }
1255
304
    }
1256
306
    /* Advancing i. */
1257
29
    ++i;
1258
29
  }
1259
613
  /* If not found, push this idx to the end of the array. */
1260
613
  if (!found)
1261
280
    ccv_array_push(head, &idx);
1262
613
  // Try to insert tail.
1263
613
  found = 0;
1264
613
  ccv_array_t* tail = tensor_blocks.tail;
1265
892
  for (i = 0; i < tail->rnum;)
1266
379
  {
1267
379
    const int tail_idx = *(int*)ccv_array_get(tail, i);
1268
379
    if (tail_idx == idx)
1269
81
    {
1270
81
      found = 1;
1271
81
      break;
1272
81
    }
1273
298
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1274
298
    if (cell.i32 && cell.i32[0] > 0)
1275
233
    {
1276
233
      /* If the current node is the child of the tail node, check if we found it or not. */
1277
233
      /* If not found, replace the current one. */
1278
233
      if (!found)
1279
218
      {
1280
218
        found = 1;
1281
218
        *(int*)ccv_array_get(tail, i) = idx;
1282
15
      } else {
1283
15
        /* Remove the current one, change the rnum. */
1284
15
        *(int*)ccv_array_get(tail, i) = *(int*)ccv_array_get(tail, tail->rnum - 1);
1285
15
        --tail->rnum;
1286
15
        continue;
1287
15
      }
1288
65
    } else {
1289
65
      // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1290
65
      cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1291
65
      if (cell.i32 && cell.i32[0] > 0)
1292
19
      {
1293
19
        found = 1;
1294
19
        break;
1295
19
      }
1296
65
    }
1297
298
    /* Advancing i. */
1298
264
    ++i;
1299
264
  }
1300
613
  /* If not found, push this idx to the end of the array. */
1301
613
  if (!found)
1302
295
    ccv_array_push(tail, &idx);
1303
613
}
1304
1305
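_ccv_nnc_tensor_block_add_exec above keeps the head and tail arrays minimal under the partial order recorded in exec_dep: a new node replaces an entry it precedes (for heads) and is dropped when an existing entry already precedes it. A compact standalone sketch of the head-side rule over a dense reachability matrix follows; reach, add_head and the node indices are illustrative only, not the library's own data structures.

#include <stdio.h>

#define N 4
/* reach[a][b] != 0 means a deterministically executes before b. */
static const int reach[N][N] = {
  {0, 1, 1, 1},
  {0, 0, 1, 1},
  {0, 0, 0, 1},
  {0, 0, 0, 0},
};

/* Keep a minimal set of "head" nodes: no kept node may be preceded by another kept node. */
static int add_head(int* const heads, int num, const int idx)
{
  int i;
  for (i = 0; i < num; i++)
  {
    if (heads[i] == idx || reach[heads[i]][idx]) /* Already covered by an earlier head. */
      return num;
    if (reach[idx][heads[i]]) /* idx comes earlier, so it replaces this head. */
    {
      heads[i] = idx;
      return num;
    }
  }
  heads[num++] = idx; /* Incomparable with everything kept so far. */
  return num;
}

int main(void)
{
  int heads[N], num = 0, i;
  num = add_head(heads, num, 2);
  num = add_head(heads, num, 1); /* Replaces 2, because 1 precedes 2. */
  num = add_head(heads, num, 3); /* Dropped, already covered by 1. */
  for (i = 0; i < num; i++)
    printf("head: %d\n", heads[i]);
  return 0;
}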
ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1306
173
{
1307
173
  if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1308
149
  {
1309
149
    assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size);
1310
149
    ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1311
149
    if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor))
1312
7
    {
1313
7
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1314
14
      while (CCV_IS_TENSOR_MULTIVIEW(mv))
1315
7
        mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1316
7
      return (ccv_nnc_tensor_t*)mv;
1317
7
    }
1318
142
    return tensor;
1319
149
  }
1320
24
  int i;
1321
24
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1322
24
    if (tensor_arena->sub_arenas[i])
1323
24
    {
1324
24
      ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1325
24
      if (tensor)
1326
24
        return tensor;
1327
24
    }
1328
0
  return 0;
1329
24
}
1330
1331
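ccv_nnc_tensor_from_symbol, just above, is the public way to recover the concrete tensor the arena assigned to a symbol; it also recurses into sub-arenas and unwraps multi-view tensors to the currently active view. A hedged usage fragment, not a complete program: it assumes tensor_arena and the tensor symbol x were produced by an earlier ccv_nnc_symbolic_graph_compile() call, which is outside this listing.

/* Assumed context: `tensor_arena` and the tensor symbol `x` come from an earlier
 * ccv_nnc_symbolic_graph_compile() call; error handling is omitted. */
ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
if (x_tensor)
  /* The returned tensor aliases the arena's buffer (or a bound tensor),
   * so writes here are exactly what the compiled graph will read. */
  x_tensor->data.f32[0] = 1;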
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1332
29
{
1333
29
  if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1334
29
  {
1335
29
    assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size);
1336
29
    return graph_exec_arena->graph_execs[symbol.d];
1337
29
  }
1338
0
  int i;
1339
0
  for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1340
0
    if (graph_exec_arena->sub_arenas[i])
1341
0
    {
1342
0
      ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1343
0
      if (!CCV_NO_GRAPH_EXEC(exec))
1344
0
        return exec;
1345
0
    }
1346
0
  return (ccv_nnc_graph_exec_t){}; // 0.
1347
0
}
1348
1349
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1350
22
{
1351
22
  return graph_exec_arena->source;
1352
22
}
1353
1354
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1355
22
{
1356
22
  return graph_exec_arena->destination;
1357
22
}
1358
1359
// Check whether the head is the beginning of this block.
1360
static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1361
5
{
1362
5
  assert(tensor_block->head);
1363
5
  return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0) == head_node);
1364
5
}
1365
1366
// Check whether the tail is the end of this block.
1367
static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
1368
9
{
1369
9
  assert(tensor_block->tail);
1370
9
  return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0) == tail_node);
1371
9
}
1372
1373
// Make two tensor blocks one. Return 1 if that happened.
1374
static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
1375
59
{
1376
59
  // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
1377
59
  if (!TENSOR_EXPECT_CONST(tensor_blocks[p_ref_0]) &&
1378
59
    !TENSOR_EXPECT_CONST(tensor_blocks[p_ref_1]) &&
1379
59
    tensor_blocks[p_ref_0].tail->rnum == 1 &&
1380
59
    tensor_blocks[p_ref_1].head->rnum == 1 &&
1381
59
    *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0))
1382
48
  {
1383
48
    // If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
1384
48
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]));
1385
48
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]));
1386
48
    ccv_array_free(tensor_blocks[p_ref_0].tail);
1387
48
    tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
1388
48
    if (tensor_blocks[p_ref_1].p_refs[0])
1389
4
    {
1390
4
      assert(tensor_blocks[p_ref_1].p_refs[1] == 0); // It simply cannot have more than one p_refs, otherwise we cannot merge.
1391
4
      if (!tensor_blocks[p_ref_0].p_refs[0])
1392
1
        tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
1393
4
      else
1394
3
        tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
1395
4
    }
1396
48
    TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]));
1397
48
    ccv_array_free(tensor_blocks[p_ref_1].head);
1398
48
    TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1]);
1399
48
    tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
1400
48
    if (!tensor_blocks[p_ref_0].r_refs)
1401
38
      tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
1402
48
    ccv_array_replace_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1, p_ref_1 + 1);
1403
48
    tensor_blocks[p_ref_1].size = 0;
1404
48
    tensor_blocks[p_ref_1].head = 0;
1405
48
    tensor_blocks[p_ref_1].tail = 0;
1406
48
    return 1;
1407
48
  }
1408
11
  return 0;
1409
59
}
1410
1411
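_ccv_nnc_tensor_blocks_try_fold above merges two blocks exactly when the producer's single tail node coincides with the consumer's single head node, the shape left behind by in-place operations. A tiny standalone illustration of that merge condition follows; block_t and try_fold are simplified stand-ins, not the library's types.

#include <stdio.h>

/* Simplified stand-in for a tensor block: one head node, one tail node, optional fold ref. */
typedef struct { int head, tail, folded_into; } block_t;

/* Fold b into a when a's lifetime ends exactly where b's begins. */
static int try_fold(block_t* const a, block_t* const b, const int a_index)
{
  if (a->tail != b->head)
    return 0;
  a->tail = b->tail; /* a now lives until b's last use. */
  b->folded_into = a_index + 1; /* 1-based reference, mirroring the real code's convention. */
  return 1;
}

int main(void)
{
  block_t input = { 0, 2, 0 };  /* Written at node 0, last read at node 2. */
  block_t output = { 2, 5, 0 }; /* Written at node 2, last read at node 5. */
  printf("folded: %d\n", try_fold(&input, &output, 0)); /* They meet at node 2, so they can share memory. */
  return 0;
}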
static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
1412
34
{
1413
34
  int i, j;
1414
34
  // Generate exec dependencies (or, in other words, partial ordering of executions).
1415
34
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
1416
34
  int* buf = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
1417
34
  int buf_size;
1418
34
#define for_block(x, val) \
1419
908
  do { \
1420
908
    if (((int32_t*)val)[0] > 0) \
1421
908
    { \
1422
908
      buf[buf_size * 2] = x; \
1423
908
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
1424
908
      ++buf_size; \
1425
908
    } \
1426
908
  } while (0)
1427
162
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, _, term) {
1428
162
    buf_size = 0; /* save all its parent deps to this buffer */
1429
162
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
1430
162
    if (vector)
1431
908
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
1432
162
    if (!node->outgoings)
1433
34
      break;
1434
290
    for (i = 0; i < node->outgoings->rnum; i++)
1435
162
    {
1436
162
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
1437
162
      const int32_t one = 1;
1438
162
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
1439
162
      /* If not found, set, if the current node is the destination node, no need 
1440
162
       * set itself as parent of subsequent nodes because its terminal nature. */
1441
162
      if (!term && (!cell.i32 || cell.i32[0] == 0))
1442
155
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
1443
1.00k
      for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
1444
846
      {
1445
846
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
1446
846
        /* If not found, set */
1447
846
        if (!cell.i32 || cell.i32[0] == 0)
1448
753
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
1449
93
        else {
1450
93
          /* Otherwise, set to the longest one */
1451
93
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
1452
93
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
1453
93
        }
1454
846
      }
1455
162
    }
1456
128
  } ccv_nnc_graph_visit_endfor
1457
34
#undef for_block
1458
34
  ccfree(buf);
1459
34
  // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
1460
34
  const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
1461
34
  ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
1462
34
  // The reason is that I need to make everyone of them to be unassigned unless it is used somewhere. It
1463
34
  // happens that I have to loop through all relevant node to find out if one is used or not.
1464
362
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1465
328
    tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type;
1466
162
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
1467
486
    for (i = 0; i < node->input_size; i++)
1468
324
      if (node->inputs[i] >= 0)
1469
291
        tensor_blocks[node->inputs[i]].flags = 0;
1470
318
    for (i = 0; i < node->output_size; i++)
1471
156
      if (node->outputs[i] >= 0)
1472
153
        tensor_blocks[node->outputs[i]].flags = 0;
1473
162
  } ccv_nnc_graph_visit_endfor
1474
362
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1475
328
  {
1476
328
    // Check no tensor info is auto now.
1477
328
    assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
1478
328
    if (tensor_symbol_info[i].alias_ref)
1479
63
    {
1480
63
      // An alias cannot ref to another alias.
1481
63
      assert(!tensor_symbol_info[tensor_symbol_info[i].alias_ref - 1].alias_ref);
1482
63
      tensor_blocks[i].flags = ALIAS;
1483
63
      tensor_blocks[i].ref = tensor_symbol_info[i].alias_ref; // Assign the ref.
1484
63
      const int ref = tensor_blocks[i].ref - 1;
1485
63
      // If the referenced one is unassigned, at least first make it assigned.
1486
63
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]))
1487
16
        tensor_blocks[ref].flags = 0;
1488
63
      if (!tensor_blocks[ref].r_refs)
1489
48
        tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
1490
63
      ccv_array_replace_int(tensor_blocks[ref].r_refs, i + 1, i + 1);
1491
63
    }
1492
328
  }
1493
34
  // Ignore tensors that are already binded, no matter if it is used or not.
1494
36
  for (i = 0; i < tensor_bind_size; i++)
1495
34
    // If there is a tensor binded, then it is unassigned, otherwise, we will allocate as constant.
1496
2
    tensor_blocks[tensor_binds[i].symbol.d].flags = tensor_binds[i].tensor ? UNASSIGNED : CONST_TENSOR;
1497
362
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1498
328
  {
1499
328
    // Check no tensor info is auto now.
1500
328
    assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
1501
328
    // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
1502
328
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1503
261
    {
1504
261
      tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
1505
261
      tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
1506
261
      // Cache tensor size (align to 16 bytes).
1507
261
      tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
1508
261
    }
1509
328
    // If there is a p_ref, add the one to the p_refs list.
1510
328
    if (tensor_symbol_info[i].p_ref)
1511
24
      tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
1512
328
  }
1513
162
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
1514
486
    for (i = 0; i < node->input_size; i++)
1515
324
    {
1516
324
      int d = node->inputs[i];
1517
324
      if (d < 0)
1518
33
        continue;
1519
291
      tensor_blocks[d].flags |= READ_ONLY;
1520
291
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1521
55
        d = tensor_symbol_info[d].alias_ref - 1;
1522
291
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
1523
1
        continue;
1524
290
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
1525
290
      /* If this is first encounter, its head starts (this tensor is init'ed outside of the graph
1526
290
       * from the very beginning of the graph life-cycle and ends here. */
1527
290
      if (tensor_blocks[d].head->rnum == 0)
1528
116
      {
1529
245
        for (j = 0; 
j < source_size245
;
j++129
)
1530
129
          _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
1531
116
        /* If this is a read-only (based on SSA, if first encountered as read), and this is
1532
116
         * sub-graph, it is not assign_ref from anywhere (not a parameterized loop). We cannot
1533
116
         * reuse this region of memory anyway (because on second loop, we want to read the same
1534
116
         * value out). Mark it to the end of the graph. */
1535
116
        if (symbolic_graph->p && !tensor_symbol_info[d].assign_ref)
1536
44
          for (j = 0; j < destination_size; j++)
1537
22
            _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
1538
116
      }
1539
290
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
1540
290
    }
1541
318
    for (i = 0; i < node->output_size; i++)
1542
156
    {
1543
156
      int d = node->outputs[i];
1544
156
      if (d < 0)
1545
3
        continue;
1546
153
      tensor_blocks[d].flags |= WRITE_ONLY;
1547
153
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1548
30
        d = tensor_symbol_info[d].alias_ref - 1;
1549
153
      if (TENSOR_EXPECT_CONST(tensor_blocks[d]) ||
1550
153
        TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
1551
1
        continue;
1552
152
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
1553
152
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
1554
152
    }
1555
162
  } ccv_nnc_graph_visit_endfor
1556
162
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
1557
162
    /* Remove tensor symbols that is for in-place operations (and it matches the start, end tensor). */
1558
162
    if (ccv_nnc_cmd_attr(node->cmd, CCV_NNC_CMD_ATTR_INPLACE))
1559
78
    {
1560
78
      int x, y;
1561
236
      for (x = 0; x < node->input_size; x++)
1562
158
      {
1563
158
        /* If the input is not assigned, it can be referenced, find the referenced one */
1564
158
        int ref = node->inputs[x];
1565
158
        if (ref < 0)
1566
26
          continue;
1567
183
        while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
1568
51
          ref = tensor_blocks[ref].ref - 1;
1569
132
        assert(tensor_blocks[ref].ref == 0);
1570
132
        const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
1571
132
        if (!TENSOR_EXPECT_CONST(tensor_blocks[ref]) &&
1572
132
          TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
1573
132
          tensor_blocks[ref].tail->rnum == 1)
1574
218
          for (y = 0; y < node->output_size; y++)
1575
105
            /* Only proceed if the input symbol is different from the output symbol, */
1576
105
            /* and the input symbol meets the output symbol exactly at the same spot. */
1577
113
            if (node->outputs[y] >= 0 &&
1578
111
              ref != node->outputs[y] &&
1579
111
              !TENSOR_EXPECT_CONST(tensor_blocks[node->outputs[y]]) &&
1580
111
              TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]]))
1581
67
            {
1582
67
              const int node_output_y = node->outputs[y];
1583
67
              const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
1584
67
              /* If dimension matches perfectly, then we can assign y_symbol to x. */
1585
67
              if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
1586
56
                _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
1587
67
            }
1588
132
      }
1589
78
    }
1590
162
  } ccv_nnc_graph_visit_endfor
1591
34
  *r_exec_dep = exec_dep;
1592
34
  *r_tensor_blocks = tensor_blocks;
1593
34
}
1594
1595
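The prep routine above fills exec_dep so that the cell (child, ancestor) holds the longest hop count from an ancestor to a node, by pushing each node's own ancestor row onto its outgoing nodes during a topological visit. The dense-matrix sketch below shows the same propagation on a 4-node diamond; it is illustrative only, since the real code uses a sparse matrix and the graph visitor.

#include <stdio.h>

#define N 4

int main(void)
{
  /* edge[i][j] != 0: a direct edge i -> j; node indices are already in topological order. */
  const int edge[N][N] = {
    {0, 1, 1, 0},
    {0, 0, 0, 1},
    {0, 0, 0, 1},
    {0, 0, 0, 0},
  };
  int dep[N][N] = {{0}}; /* dep[child][ancestor] = longest hop count, 0 = unreachable. */
  int idx, outgoing, k;
  for (idx = 0; idx < N; idx++)
    for (outgoing = 0; outgoing < N; outgoing++)
      if (edge[idx][outgoing])
      {
        if (dep[outgoing][idx] < 1)
          dep[outgoing][idx] = 1; /* Direct parent. */
        for (k = 0; k < N; k++) /* Inherit all of idx's ancestors, one hop further away. */
          if (dep[idx][k] && dep[outgoing][k] < dep[idx][k] + 1)
            dep[outgoing][k] = dep[idx][k] + 1;
      }
  printf("longest hops from 0 to 3: %d\n", dep[3][0]); /* Prints 2, via either middle node. */
  return 0;
}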
static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
1596
12
{
1597
12
  if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
1598
1
  {
1599
1
    ccv_nnc_cmd_t retval = cmd;
1600
1
    retval.cmd = CCV_NNC_NOOP;
1601
1
    return retval;
1602
1
  }
1603
11
  return cmd;
1604
12
}
1605
1606
static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int n_times, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
1607
31
{
1608
31
  if (dup_tensor_block_ref[input * n_times] < 0) // No tensor ref, create one.
1609
15
  {
1610
15
    if (tensor_symbol_info[input].alias_ref)
1611
9
    {
1612
9
      const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
1613
9
      assert(tensor_symbol_info[alias_ref].alias_ref == 0);
1614
9
      ccv_nnc_tensor_symbol_t tensor_symbol;
1615
9
      if (dup_tensor_block_ref[alias_ref * n_times] < 0)
1616
3
      {
1617
3
        tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
1618
3
        dup_tensor_block_ref[alias_ref * n_times] = tensor_symbol.d;
1619
6
      } else {
1620
6
        tensor_symbol.d = dup_tensor_block_ref[alias_ref * n_times];
1621
6
        tensor_symbol.graph = dup_graph;
1622
6
      }
1623
9
      ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].inc, tensor_symbol_info[input].info, 0);
1624
9
      dup_tensor_block_ref[input * n_times] = alias_symbol.d;
1625
6
    } else {
1626
6
      ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
1627
6
      dup_tensor_block_ref[input * n_times] = tensor_symbol.d;
1628
6
    }
1629
15
  }
1630
31
  return (ccv_nnc_tensor_symbol_t) {
1631
31
    .d = dup_tensor_block_ref[input * n_times],
1632
31
    .graph = dup_graph,
1633
31
  };
1634
31
}
1635
1636
static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int n_times, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
1637
24
{
1638
24
  int i;
1639
24
  if (dup_exec_ref[idx * n_times] < 0)
1640
15
  {
1641
37
    for (i = 0; i < node->input_size; i++)
1642
22
      max_inputs[i] = _ccv_nnc_dup_tensor_symbol(dup_graph, n_times, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]);
1643
24
    for (i = 0; i < node->output_size; i++)
1644
9
      max_outputs[i] = _ccv_nnc_dup_tensor_symbol(dup_graph, n_times, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]);
1645
15
    ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
1646
15
    dup_exec_ref[idx * n_times] = exec_symbol.d;
1647
15
  }
1648
24
  return (ccv_nnc_graph_exec_symbol_t) {
1649
24
    .d = dup_exec_ref[idx * n_times],
1650
24
    .graph = dup_graph,
1651
24
  };
1652
24
}
1653
1654
static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
1655
34
{
1656
34
  int i;
1657
386
  for (i = 0; i < tensor_block_size; i++)
1658
352
  {
1659
352
    if (tensor_blocks[i].head)
1660
237
      ccv_array_free(tensor_blocks[i].head);
1661
352
    if (tensor_blocks[i].tail)
1662
237
      ccv_array_free(tensor_blocks[i].tail);
1663
352
    if (tensor_blocks[i].r_refs)
1664
86
      ccv_array_free(tensor_blocks[i].r_refs);
1665
352
  }
1666
34
  ccfree(tensor_blocks);
1667
34
}
1668
1669
// Find tensors that cannot be solved by co-allocating to the same location.
1670
static int _ccv_nnc_exec_dep_and_tensor_blocks_find_hard_cases(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
1671
29
{
1672
29
  int i, j, nth = 0;
1673
309
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1674
280
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_symbol_info[i].assign_ref)
1675
11
    {
1676
11
      // This is a parameter, thus, it has to be either an alias or used.
1677
11
      assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
1678
11
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
1679
11
      // The parameter it assign to has to be either an alias or used.
1680
11
      assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
1681
11
      // If any of this two (assigner and assignee) is an alias, check to see if they are the same.
1682
11
      // If it is the same, we are good, no need to extend.
1683
11
      int a_ref = i;
1684
11
      while (tensor_blocks[a_ref].ref)
1685
0
        a_ref = tensor_blocks[a_ref].ref - 1;
1686
11
      int b_ref = assign_ref;
1687
15
      while (tensor_blocks[b_ref].ref)
1688
4
        b_ref = tensor_blocks[b_ref].ref - 1;
1689
11
      if (a_ref != b_ref)
1690
7
      {
1691
7
        // If any of the b's head is deterministically later than a's tail
1692
7
        // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
1693
7
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
1694
7
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
1695
7
        // It cannot be that both i can hop to j and j can hop to i.
1696
7
        assert(!(a_hop_b > 0 && b_hop_a > 0));
1697
7
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
1698
7
        if (a_hop_b || b_hop_a)
1699
2
        {
1700
2
          tensor_blocks[a_ref].companion_ref = b_ref + 1;
1701
2
          tensor_blocks[b_ref].companion_ref = a_ref + 1;
1702
2
          continue;
1703
2
        }
1704
5
        int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
1705
6
        for (j = 0; c_ref >= 0; j++)
1706
1
        {
1707
1
          while (tensor_blocks[c_ref].ref)
1708
0
            c_ref = tensor_blocks[c_ref].ref - 1;
1709
1
          c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
1710
1
        }
1711
5
        nth = ccv_max(nth, j + 1);
1712
5
      }
1713
11
    }
1714
29
  // Reset companion_ref if need to unroll.
1715
29
  if (nth)
1716
35
    for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
1717
30
      tensor_blocks[j].companion_ref = 0;
1718
29
  return nth;
1719
29
}
1720
1721
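The hard-case scan above decides the unroll count by walking the chain of assign_refs: each additional loop-carried hand-off that cannot share memory forces one more unrolled copy. A standalone sketch of just the chain-length computation follows; the assign_ref array and its values here are hypothetical.

#include <stdio.h>

#define N 5

int main(void)
{
  /* assign_ref[i] is 1-based like the real code: tensor i is carried over from tensor
   * assign_ref[i] - 1 on the next loop iteration; 0 means not a loop-carried parameter. */
  const int assign_ref[N] = {0, 1, 2, 0, 0}; /* 2 <- 1 <- 0: a chain of two carries. */
  int nth = 0, i;
  for (i = 0; i < N; i++)
    if (assign_ref[i])
    {
      int c = assign_ref[i] - 1, j = 0;
      for (; assign_ref[c]; j++) /* Follow the chain until it ends. */
        c = assign_ref[c] - 1;
      if (j + 1 > nth)
        nth = j + 1; /* The longest chain decides how many copies to unroll. */
    }
  printf("unroll %d time(s)\n", nth); /* Prints 2 for the chain above. */
  return 0;
}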
static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n_times(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int n_times, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
1722
5
{
1723
5
  int i, j, n;
1724
5
  // The inout exec nodes, these are the nodes we are going to extend.
1725
5
  uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
1726
5
  int max_input_size = 0;
1727
5
  int max_output_size = 0;
1728
17
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
1729
12
  {
1730
12
    max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
1731
12
    max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
1732
12
  }
1733
5
  ccv_nnc_tensor_symbol_t* max_inputs = max_input_size > 0 ? (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * max_input_size) : 0;
1734
5
  ccv_nnc_tensor_symbol_t* max_outputs = max_output_size > 0 ? (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * max_output_size) : 0;
1735
5
  // Doing graph expansion
1736
5
  // It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
1737
5
  assert(dup_graph->exec_symbol_info->rnum > 0);
1738
5
  assert(dup_graph->tensor_symbol_info->rnum > 0);
1739
30
#define INCOMING_NODE (1)
1740
9
#define OUTGOING_NODE (2)
1741
5
  // Unroll the graph n times.
1742
11
  for (n = 0; n < n_times; n++)
1743
6
  {
1744
6
    int* const dup_exec_ref = r_dup_exec_ref + n;
1745
5
    const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
1746
6
    int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
1747
21
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
1748
15
      dup_exec_ref[i * n_times] = -1;
1749
45
    for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1750
39
    {
1751
39
      // If there is an assign_ref, that means I don't need to dup the tensor.
1752
39
      if (tensor_symbol_info[i].assign_ref)
1753
8
      {
1754
8
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1755
6
        dup_tensor_block_ref[i * n_times] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * n_times] : assign_ref;
1756
31
      } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY)
1757
31
      // If this is a read-only tensor block, no need to duplicate because the value never changes
1758
31
      // (note we handled assign_ref first), therefore, no need to generate duplicate.
1759
10
        dup_tensor_block_ref[i * n_times] = i;
1760
31
      else
1761
21
        dup_tensor_block_ref[i * n_times] = -1;
1762
39
    }
1763
6
    // Go through the original graph, make copies of the node if it is inout.
1764
15
    ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
1765
15
      ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, n_times, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
1766
15
      inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
1767
15
      if (!node->outgoings)
1768
6
        break;
1769
18
      for (i = 0; i < node->outgoings->rnum; i++)
1770
9
      {
1771
9
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
1772
9
        inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
1773
9
        ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, n_times, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
1774
9
        ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
1775
9
      }
1776
9
    } ccv_nnc_graph_visit_endfor
1777
6
    // Check the visitor are all marked as either incoming or outgoing.
1778
6
    const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
1779
6
    const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
1780
21
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
1781
15
    {
1782
15
      assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
1783
15
      // If this is pure incoming nodes, then I need to concat this one with all original destination node
1784
15
      if (inout[i] == INCOMING_NODE)
1785
12
        for (j = 0; j < dup_destination_size; j++)
1786
6
        {
1787
6
          ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
1788
6
            .d = dup_destinations[j].d,
1789
6
            .graph = dup_graph,
1790
6
          }, (ccv_nnc_graph_exec_symbol_t) {
1791
6
            .d = dup_exec_ref[i * n_times],
1792
6
            .graph = dup_graph,
1793
6
          });
1794
6
        }
1795
15
    }
1796
6
    if (dup_graph->destinations)
1797
6
      ccv_array_clear(dup_graph->destinations);
1798
21
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
1799
15
    {
1800
15
      const int d = dup_exec_ref[i * n_times];
1801
15
      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
1802
15
      // If this has no outgoing node, add to the destination.
1803
15
      if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
1804
6
        ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
1805
6
          .graph = dup_graph,
1806
6
          .d = d,
1807
6
        });
1808
15
    }
1809
6
  }
1810
5
#undef INCOMING_NODE
1811
5
#undef OUTGOING_NODE
1812
5
  ccfree(inout);
1813
5
  ccfree(max_inputs);
1814
5
  ccfree(max_outputs);
1815
5
}
1816
1817
static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_nth_unroll, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
1818
29
{
1819
29
  int i, j;
1820
29
  ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
1821
29
  ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
1822
29
  // blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
1823
29
  // Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
1824
29
  // No need to change anything, we are good.
1825
29
  const int nth_unroll = _ccv_nnc_exec_dep_and_tensor_blocks_find_hard_cases(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
1826
29
  if (!nth_unroll)
1827
24
    return;
1828
29
  // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
1829
29
  // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
1830
5
  ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
1831
5
  int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * nth_unroll);
1832
5
  int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * nth_unroll);
1833
5
  _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n_times(symbolic_graph, visit, nth_unroll, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
1834
5
  ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
1835
5
  ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
1836
5
  ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum);
1837
5
  ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
1838
5
  // Free out the old exec_dep
1839
5
  ccv_matrix_free(exec_dep);
1840
5
  // and the tensor blocks, prepare for the new.
1841
5
  _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
1842
5
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, dup_visit, tensor_binds, tensor_bind_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, dup_exec_symbol_info, dup_tensor_symbol_info, &exec_dep, &tensor_blocks);
1843
5
  ccv_nnc_graph_visit_free(dup_visit);
1844
5
  ccfree(dup_exec_symbol_info);
1845
5
  ccfree(dup_tensor_symbol_info);
1846
5
  // Assign out dup_p_ref, which will be used to extended the anonymous block life-time.
1847
35
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1848
5
    // Loop over all possible duplications to assign dup_p_ref properly.
1849
69
    for (j = 0; j < nth_unroll; j++)
1850
39
    {
1851
39
      const int dup_idx = dup_tensor_block_ref[j + i * nth_unroll];
1852
39
      if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
1853
7
      {
1854
7
        const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
1855
7
        const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->p->exec_symbol_info, symbolic_graph->exec_idx - 1));
1856
7
        if (p_ref_0_is_in_or_out == 1)
1857
2
          tensor_blocks[dup_idx].dup_p_ref = p_ref_0 + 1;
1858
7
        if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
1859
7
          continue;
1860
0
        const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
1861
0
        const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->p->exec_symbol_info, symbolic_graph->exec_idx - 1));
1862
0
        if (p_ref_1_is_in_or_out == 1)
1863
0
          tensor_blocks[dup_idx].dup_p_ref = p_ref_1 + 1;
1864
0
      }
1865
39
    }
1866
5
  // companion_ref
1867
35
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1868
5
    // Now can assign them (The dup) as companion.
1869
30
    
if (30
!30
TENSOR_EXPECT_UNASSIGNED30
(tensor_blocks[i]) &&
tensor_symbol_info[i].assign_ref30
)
1870
6
    {
1871
6
      // Get to the last one, which we will wrap over.
1872
6
      const int assign_ref = dup_tensor_block_ref[(tensor_symbol_info[i].assign_ref - 1) * nth_unroll + nth_unroll - 1];
1873
6
      if (assign_ref >= 0)
1874
6
      {
1875
6
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[assign_ref]);
1876
6
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[assign_ref], tensor_blocks[i]);
1877
6
        // It cannot be that both i can hop to j and j can hop to i.
1878
6
        // And it is now possible to hop from one to the other after duplication.
1879
6
        assert(a_hop_b > 0 || b_hop_a > 0);
1880
6
        tensor_blocks[i].companion_ref = assign_ref + 1;
1881
6
        tensor_blocks[assign_ref].companion_ref = i + 1;
1882
6
      }
1883
6
    }
1884
5
  // Extend the dup tensor block ref, prepare for future extensions.
1885
5
  dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * nth_unroll);
1886
33
  for (i = symbolic_graph->tensor_symbol_info->rnum * nth_unroll; 
i < dup_graph->tensor_symbol_info->rnum * nth_unroll33
;
i++28
)
1887
28
    dup_tensor_block_ref[i] = -1;
1888
5
  // Assign out changed properties.
1889
5
  *r_exec_dep = exec_dep;
1890
5
  *r_tensor_blocks = tensor_blocks;
1891
5
  *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
1892
5
  *r_dup_graph = dup_graph;
1893
5
  *r_nth_unroll = nth_unroll;
1894
5
  *r_dup_exec_ref = dup_exec_ref;
1895
5
  *r_dup_tensor_block_ref = dup_tensor_block_ref;
1896
5
}
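
A minimal sketch (not part of the instrumented source) of the indexing convention used for dup_exec_ref and dup_tensor_block_ref in the rest of this file: both arrays are flat row-major tables with one row per original symbol and nth_unroll columns, where an entry of -1 means that duplicate does not exist. The helper name dup_ref_lookup is hypothetical.

  // Hypothetical helper: returns the j-th duplicate of original index i, or -1 if absent.
  static int dup_ref_lookup(const int* const dup_ref, const int nth_unroll, const int i, const int j)
  {
    // Callers in this file compute the same offset inline, e.g. dup_ref[i * nth_unroll + j].
    return dup_ref[i * nth_unroll + j];
  }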
1897
1898
// Plan out how we allocate tensors (should I do optimizations on the graph here or not at all?).
1899
static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
1900
29
{
1901
29
  assert(source_size > 0);
1902
29
  assert(destination_size > 0);
1903
29
  // First, fill all the "auto" holes.
1904
29
  // This is the symbol table with the "auto" info filled up.
1905
29
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
1906
29
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
1907
29
  ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size);
1908
29
  ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
1909
29
  int i, j, k;
1910
29
  ccv_sparse_matrix_t* exec_dep;
1911
29
  ccv_nnc_tensor_block_t* tensor_blocks;
1912
29
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, exec_symbol_info, tensor_symbol_info, &exec_dep, &tensor_blocks);
1913
29
  int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
1914
29
  // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
1915
29
  // are automatically filled in, and all the sub-graphs are processed.
1916
29
  // There is a last step though, for a while loop, it is parameterized:
1917
29
  // while (x > 5) {
1918
29
  //     y = x + 1;
1919
29
  // } (y => x) // This means after this loop is done, y's value will be copied over to x.
1920
29
  // we will do our best to avoid doing the actual data copy; what we do here is to check whether y can be x's alias.
1921
29
  // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
1922
29
  // it is an inplace operation.
1923
29
  // But if y cannot be x's alias, for example, this while loop looks like this:
1924
29
  // while (x > 5) {
1925
29
  //     y = x + a
1926
29
  //     b = x + y
1927
29
  // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
1928
29
  // For this example, y cannot be x's alias because x is used later to compute b (and that computation
1929
29
  // has dependency on y as well).
1930
29
  // For this case, we need to modify the computation graph. Previously, the graph looks like this:
1931
29
  // y = x + a -> b = x + y
1932
29
  // This graph will be extended to look like this:
1933
29
  // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
1934
29
  // while (x0 > 5) {
1935
29
  //     y0 = x0 + a0
1936
29
  //     b0 = x0 + y0
1937
29
  //     if (y0 > 5) break
1938
29
  //     y1 = y0 + b0
1939
29
  //     b1 = y0 + y1
1940
29
  // } (y1 => x0, b1 => a0)
1941
29
  // After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere
1942
29
  // with each other now).
1943
29
  // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
1944
29
  // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
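
A minimal double-buffering sketch (not part of the instrumented source) of the pointer-switch idea described above, independent of the tensor_multiview_t machinery; the names N, should_continue and compute are hypothetical stand-ins for the loop condition and the unrolled loop body.

  // Two slots carry the loop state; each iteration reads one slot and writes the
  // other, then flips the index, so the "y => x" carry-over is a pointer switch
  // rather than a data copy.
  #define N 16 /* hypothetical tensor size */
  static void run_double_buffered(float slot[2][N], int (*should_continue)(const float*), void (*compute)(const float* x, float* y))
  {
    int cur = 0;
    while (should_continue(slot[cur]))
    {
      compute(slot[cur], slot[1 - cur]); // reads x (= slot[cur]), writes y (= slot[1 - cur])
      cur = 1 - cur; // the pointer switch: y becomes the next iteration's x
    }
  }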
1945
29
  ccv_nnc_symbolic_graph_t* dup_graph = 0;
1946
29
  int* dup_exec_ref = 0;
1947
29
  int* dup_tensor_block_ref = 0;
1948
29
  int nth_unroll = 0;
1949
29
  // Cannot handle duplicating a node that is a graph as well.
1950
29
  _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &nth_unroll, &dup_exec_ref, &dup_tensor_block_ref);
1951
29
  // In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
1952
29
  ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
1953
29
  prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
1954
29
  // TODO: I should be able to express that you cannot fold or reuse certain tensor blocks.
1955
19
  ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && 
symbolic_graph->sub_graphs->rnum10
?
(ccv_nnc_symbolic_graph_prep_t**)10
cccalloc10
(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) :
019
;
1956
135
  
ccv_nnc_graph_visit_for135
(visit, exec_symbol_info, node, idx) {135
1957
135
    if (node->graph_ref)
1958
10
    {
1959
10
      ccv_nnc_symbolic_graph_t* while_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, node->graph_ref - 1);
1960
10
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(while_graph, tensor_binds, tensor_bind_size, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get10
(while_graph->sources, 0), while_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get10
(while_graph->destinations, 0), while_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
1961
10
      sub_prep->p = prep;
1962
10
      sub_preps[node->graph_ref - 1] = sub_prep;
1963
10
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
1964
10
      const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
1965
75
      for (i = 0; 
i < s_alloc_prep->block_size75
;
i++65
)
1966
65
      {
1967
65
        const int block_ref = s_alloc_prep->blocks[i].block_ref;
1968
65
        const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
1969
65
        if (block_ref < sub_prep->tensor_symbol_info_size)
1970
43
        {
1971
43
          if (s_tensor_blocks[block_ref].p_refs[0])
1972
15
          {
1973
15
            /* If it is already properly assigned, next. */
1974
15
            if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
1975
15
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
1976
15
            {
1977
15
              if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
1978
15
                s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
1979
0
              else {
1980
0
                assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
1981
0
                s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
1982
0
              }
1983
15
            }
1984
15
            /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
1985
15
            if (s_tensor_blocks[block_ref].p_refs[1] &&
1986
3
              s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
1987
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
1988
3
            {
1989
3
              assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
1990
3
              assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
1991
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
1992
3
            }
1993
15
          }
1994
22
        } else 
if (22
s_tensor_blocks[block_ref].dup_p_ref22
)
{2
1995
2
          /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous blocks,
1996
2
           * which by default only have a life-cycle shared with this sub-graph node. The reason to extend is that
1997
2
           * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
1998
2
           * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
1999
2
           * its life-time to the end of the output tensor. */
2000
2
          if (!s_alloc_prep->buffers[buffer_ref].dup_p_ref)
2001
2
            s_alloc_prep->buffers[buffer_ref].dup_p_ref = s_tensor_blocks[block_ref].dup_p_ref;
2002
2
          assert(s_alloc_prep->buffers[buffer_ref].dup_p_ref == s_tensor_blocks[block_ref].dup_p_ref);
2003
2
        }
2004
65
      }
2005
10
      int anonymous_buffer_size = 0;
2006
46
      for (i = 0; 
i < s_alloc_prep->buffer_size46
;
i++36
)
2007
36
        
if (36
s_alloc_prep->buffers[i].p_refs[0]36
)
2008
15
        {
2009
15
          /* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */
2010
15
          int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
2011
15
          /* Need to go through refs. Since we reuse the tensor block for this input, it now has to have at least this much space allocated. */
2012
15
          int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
2013
15
          assert(p_ref_0_is_in_or_out != 0);
2014
15
          while (tensor_blocks[p_ref_0].ref)
2015
0
            p_ref_0 = tensor_blocks[p_ref_0].ref - 1;
2016
15
          int folded = 0;
2017
15
          /* This parent tensor block cannot be unassigned because it is either an input or an output of this sub-graph node. */
2018
15
          assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[p_ref_0]));
2019
15
          if (s_alloc_prep->buffers[i].p_refs[1])
2020
3
          {
2021
3
            int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
2022
3
            const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
2023
3
            assert(p_ref_1_is_in_or_out != 0);
2024
3
            while (tensor_blocks[p_ref_1].ref)
2025
0
              p_ref_1 = tensor_blocks[p_ref_1].ref - 1;
2026
3
            /* See above comment for the similar p_ref_0 check. */
2027
3
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[p_ref_1]));
2028
3
            assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
2029
3
            int p_ref_t;
2030
3
            if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
2031
3
              CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
2032
3
            p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
2033
3
            /* If the dimensions match, we can fold. */
2034
3
            if (
memcmp(tensor_symbol_info[p_ref_1].info.dim, tensor_symbol_info[p_ref_0].info.dim, sizeof(int) * 3
CCV_NNC_MAX_DIM_ALLOC3
) == 0)
2035
3
            {
2036
3
              folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, p_ref_1, p_ref_0);
2037
3
              if (folded)
2038
1
              {
2039
1
                p_ref_0 = p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
2040
1
                for (j = 0; 
j < nth_unroll1
;
j++0
) /* Fold its duplicates as well. */
2041
0
                {
2042
0
                  const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[p_ref_1 * nth_unroll + j], dup_tensor_block_ref[p_ref_0 * nth_unroll + j]);
2043
0
                  assert(folded && "the subsequent duplicates can be folded too.");
2044
0
                }
2045
1
              }
2046
3
            }
2047
3
          }
2048
15
          /* Only proceed if it is folded (thus, the input / output tensor can be connected, reuse is not a problem).
2049
15
           * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
2050
15
           * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
2051
15
           * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
2052
15
           * within its memory region)). */
2053
15
          if (folded ||
2054
14
            
(p_ref_0_is_in_or_out == 1 && 14
_ccv_nnc_tensor_block_check_head(tensor_blocks + p_ref_0, idx)5
) ||
2055
9
            
(p_ref_0_is_in_or_out == -1 && 9
_ccv_nnc_tensor_block_check_tail(tensor_blocks + p_ref_0, idx)9
))
2056
13
          {
2057
13
            /* p_ref_0 is either the only one, or the output tensor; we always prefer the output tensor (there
2058
13
             * is a long argument why that is the case; the gist is, it is much easier to control your output
2059
13
             * than your input). */
2060
13
            s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
2061
13
            s_alloc_prep->buffers[i].p_refs[1] = 0;
2062
13
            /* This parent tensor block cannot be unassigned because it is either an input or an output of this sub-graph node. */
2063
13
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[p_ref_0]));
2064
13
            tensor_blocks[p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[p_ref_0].size);
2065
14
            for (j = 0; 
j < nth_unroll14
;
j++1
) /* Change the size of its duplicates as well. */
2066
1
              tensor_blocks[dup_tensor_block_ref[p_ref_0 * nth_unroll + j]].size = tensor_blocks[p_ref_0].size;
2067
2
          } else {
2068
2
            s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
2069
2
            ++anonymous_buffer_size;
2070
2
          }
2071
15
        } else
2072
21
          ++anonymous_buffer_size;
2073
10
      if (anonymous_buffer_size)
2074
6
      {
2075
6
        /* Anonymous block, allocate additional tensor blocks for this. */
2076
6
        /* This is either because this is an internal tensor (don't have p_ref) */
2077
6
        /* or it is an anonymous block itself within the sub graphs of this while graph. */
2078
6
        tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (tensor_block_size + (nth_unroll + 1) * anonymous_buffer_size));
2079
6
        memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (nth_unroll + 1) * anonymous_buffer_size);
2080
6
        if (dup_tensor_block_ref)
2081
1
          
dup_tensor_block_ref = (int*)1
ccrealloc1
(dup_tensor_block_ref, sizeof(int) * nth_unroll * (tensor_block_size + anonymous_buffer_size));
2082
34
        for (i = 0; 
i < s_alloc_prep->buffer_size34
;
i++28
)
2083
28
          
if (28
!s_alloc_prep->buffers[i].p_refs[0]28
)
2084
23
          {
2085
23
            TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
2086
23
            TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
2087
23
            tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
2088
23
            tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
2089
23
            s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
2090
23
            tensor_blocks[tensor_block_size].graph_ref = node->graph_ref;
2091
23
            tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
2092
23
            ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
2093
23
            const int dup_p_ref = s_alloc_prep->buffers[i].dup_p_ref - 1;
2094
23
            if (dup_p_ref >= 0)
2095
2
            {
2096
2
              assert(tensor_blocks[dup_p_ref].tail);
2097
2
              tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
2098
2
              memcpy(
ccv_array_get2
(tensor_blocks[tensor_block_size].tail, 0),
ccv_array_get2
(tensor_blocks[dup_p_ref].tail, 0), sizeof(int) * tensor_blocks[dup_p_ref].tail->rnum);
2099
2
              tensor_blocks[tensor_block_size].tail->rnum = tensor_blocks[dup_p_ref].tail->rnum;
2100
21
            } else {
2101
21
              tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
2102
21
              ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
2103
21
            }
2104
23
            if (
TENSOR_READ_WRITE23
(s_alloc_prep->buffers[i]) == READ_ONLY23
) /* If it is read-only, add all sources (destinations) to it. */
2105
16
            {
2106
32
              for (j = 0; 
j < source_size32
;
j++16
)
2107
16
                _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
2108
16
              /* If this is read-only (based on SSA, if first encountered as a read), and this is a
2109
16
               * sub-graph, mark it to the end of the graph. */
2110
16
              if (symbolic_graph->p)
2111
4
                
for (j = 0; 2
j < destination_size4
;
j++2
)
2112
2
                  _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
2113
16
            }
2114
23
            ++tensor_block_size;
2115
23
            /* ref and flags are both 0. */
2116
23
            if (
TENSOR_READ_WRITE23
(s_alloc_prep->buffers[i]) == READ_ONLY23
) /* If it is read-only, it is self-reflecting. */
2117
16
            {
2118
18
              for (k = 0; 
k < nth_unroll18
;
k++2
)
2119
2
              {
2120
4
                for (j = 0; 
j < destination_size4
;
j++2
)
2121
2
                  
if (2
dup_exec_ref[destinations[j].d * nth_unroll + k] >= 02
)
2122
2
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * nth_unroll + k], tensor_blocks[tensor_block_size - 1]);
2123
2
                /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
2124
2
                assert(symbolic_graph->p);
2125
2
                dup_tensor_block_ref[(tensor_block_size - 1) * nth_unroll + k] = tensor_block_size - 1;
2126
2
              }
2127
7
            } else {
2128
7
              const int prev_tensor_block_idx = tensor_block_size - 1;
2129
8
              for (k = 0; 
k < nth_unroll8
;
k++1
)
2130
1
              {
2131
1
                dup_tensor_block_ref[prev_tensor_block_idx * nth_unroll + k] = tensor_block_size;
2132
1
                TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
2133
1
                TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
2134
1
                tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
2135
1
                tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
2136
1
                tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
2137
1
                /* Attach to duplicated exec for this tensor block. */
2138
1
                ccv_array_push(tensor_blocks[tensor_block_size].head, &dup_exec_ref[idx * nth_unroll + k]);
2139
1
                if (dup_p_ref >= 0)
2140
1
                {
2141
1
                  /* Not nil, not self-reflecting. */
2142
1
                  assert(dup_tensor_block_ref[dup_p_ref * nth_unroll + k] >= 0 && dup_tensor_block_ref[dup_p_ref * nth_unroll + k] != dup_p_ref);
2143
1
                  const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * nth_unroll + k];
2144
1
                  assert(tensor_blocks[dup_dup_p_ref].tail);
2145
1
                  tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
2146
1
                  memcpy(
ccv_array_get1
(tensor_blocks[tensor_block_size].tail, 0),
ccv_array_get1
(tensor_blocks[dup_dup_p_ref].tail, 0), sizeof(int) * tensor_blocks[dup_dup_p_ref].tail->rnum);
2147
1
                  tensor_blocks[tensor_block_size].tail->rnum = tensor_blocks[dup_dup_p_ref].tail->rnum;
2148
0
                } else {
2149
0
                  tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
2150
0
                  ccv_array_push(tensor_blocks[tensor_block_size].tail, &dup_exec_ref[idx * nth_unroll + k]);
2151
0
                }
2152
1
                ++tensor_block_size;
2153
1
              }
2154
7
            }
2155
23
          }
2156
6
      }
2157
10
    }
2158
135
  } ccv_nnc_graph_visit_endfor
2159
29
  // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
2160
29
  // the allocation dependencies, i.e., which tensor reuses the memory of an existing tensor.
2161
29
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new(exec_dep, tensor_blocks, tensor_block_size);
2162
29
  ccv_matrix_free(exec_dep);
2163
29
  prep->p = 0;
2164
29
  prep->graph_ref = (intptr_t)symbolic_graph;
2165
29
  prep->p_idx = symbolic_graph->p_idx;
2166
29
  prep->exec_idx = symbolic_graph->exec_idx;
2167
19
  prep->sub_prep_size = symbolic_graph->sub_graphs ? 
symbolic_graph->sub_graphs->rnum10
:
019
;
2168
29
  prep->sub_preps = sub_preps;
2169
29
  prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
2170
29
  prep->exec_symbol_info = exec_symbol_info;
2171
29
  prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
2172
29
  prep->tensor_symbol_info = tensor_symbol_info;
2173
29
  prep->nth_unroll = nth_unroll;
2174
29
  prep->dup_tensor_block_ref = dup_tensor_block_ref;
2175
29
  prep->tensor_block_size = tensor_block_size;
2176
29
  prep->tensor_blocks = tensor_blocks;
2177
29
  prep->visit = visit;
2178
29
  prep->alloc_prep = alloc_prep;
2179
29
  if (dup_graph)
2180
5
    ccv_nnc_symbolic_graph_free(dup_graph);
2181
29
  if (dup_exec_ref)
2182
5
    
ccfree5
(dup_exec_ref)5
;
2183
29
  return prep;
2184
29
}
2185
2186
static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* prep)
2187
29
{
2188
29
  int i;
2189
29
  _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
2190
40
  for (i = 0; 
i < prep->sub_prep_size40
;
i++11
)
2191
11
    
if (11
prep->sub_preps[i]11
)
2192
10
      _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
2193
29
  if (prep->sub_preps)
2194
10
    
ccfree10
(prep->sub_preps)10
;
2195
29
  ccfree(prep->tensor_symbol_info);
2196
29
  ccfree(prep->exec_symbol_info);
2197
29
  if (prep->dup_tensor_block_ref)
2198
5
    
ccfree5
(prep->dup_tensor_block_ref)5
;
2199
29
  _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
2200
29
  ccv_nnc_graph_visit_free(prep->visit);
2201
29
  ccfree(prep);
2202
29
}
2203
2204
static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
2205
29
{
2206
29
  int i, j, k;
2207
29
  ccv_nnc_graph_t* const graph = graph_prep->graph;
2208
29
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (symbolic_graph->exec_symbol_info->rnum - 1));
2209
29
  graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
2210
29
  graph_exec_arena->graph_exec_size = symbolic_graph->exec_symbol_info->rnum;
2211
29
  graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
2212
29
  graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + symbolic_graph->exec_symbol_info->rnum);
2213
29
  memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
2214
29
  ccv_nnc_graph_exec_t* graph_execs = graph_exec_arena->graph_execs;
2215
29
  int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
2216
165
  for (i = 0; 
i < symbolic_graph->exec_symbol_info->rnum165
;
i++136
)
2217
136
  {
2218
136
    max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size);
2219
136
    max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size);
2220
136
    graph_execs[i].graph = 0;
2221
136
  }
2222
40
  for (i = 0; 
i < graph_prep->sub_prep_size40
;
i++11
)
2223
11
    
max_breakpoint_size = 11
ccv_max11
(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size);
2224
28
  ccv_nnc_tensor_t** max_inputs = max_input_size > 0 ? 
(ccv_nnc_tensor_t**)28
ccmalloc28
(sizeof(ccv_nnc_tensor_t*) * max_input_size) :
01
;
2225
28
  ccv_nnc_tensor_t** max_outputs = max_output_size > 0 ? 
(ccv_nnc_tensor_t**)28
ccmalloc28
(sizeof(ccv_nnc_tensor_t*) * max_output_size) :
01
;
2226
19
  ccv_nnc_graph_exec_t* max_breakpoints = max_breakpoint_size > 0 ? 
(ccv_nnc_graph_exec_t*)10
ccmalloc10
(sizeof(ccv_nnc_graph_exec_t) * max_breakpoint_size) :
019
;
2227
29
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
2228
135
  
ccv_nnc_graph_visit_for135
(graph_prep->visit, exec_symbol_info, node, idx) {135
2229
135
    if (CCV_NO_GRAPH_EXEC(graph_execs[idx]))
2230
33
    {
2231
73
      for (i = 0; 
i < node->input_size73
;
i++40
)
2232
40
        
max_inputs[i] = node->inputs[i] >= 0 ? 40
tensor_arena->vt_tensors[node->inputs[i]]40
:
00
;
2233
52
      for (i = 0; 
i < node->output_size52
;
i++19
)
2234
19
        
max_outputs[i] = node->outputs[i] >= 0 ? 19
tensor_arena->vt_tensors[node->outputs[i]]19
:
00
;
2235
33
      if (node->graph_ref)
2236
9
      {
2237
9
        const int graph_ref = node->graph_ref - 1;
2238
9
        ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
2239
9
        graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
2240
9
        const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
2241
9
        const ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
2242
18
        for (i = 0; 
i < sub_symbolic_graph->breakpoint_size18
;
i++9
)
2243
9
          max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
2244
9
        ccv_nnc_graph_exec_t source = ccv_nnc_graph_exec_source(sub_arena);
2245
9
        ccv_nnc_graph_exec_t destination = ccv_nnc_graph_exec_destination(sub_arena);
2246
9
        ccv_nnc_graph_set_sources(sub_graph, &source, 1);
2247
9
        ccv_nnc_graph_set_destinations(sub_graph, &destination, 1);
2248
9
        ccv_nnc_graph_set_while_expr(sub_graph, sub_symbolic_graph->while_expr, sub_symbolic_graph->while_data, max_breakpoints, sub_symbolic_graph->breakpoint_size);
2249
9
        ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
2250
24
      } else {
2251
24
        graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
2252
24
      }
2253
33
    }
2254
135
    if (!node->outgoings)
2255
29
      break;
2256
246
    
for (i = 0; 106
i < node->outgoings->rnum246
;
i++140
)
2257
140
    {
2258
140
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
2259
140
      if (CCV_NO_GRAPH_EXEC(graph_execs[outgoing]))
2260
102
      {
2261
102
        const ccv_nnc_graph_exec_symbol_info_t* const outgoing_node = exec_symbol_info + outgoing;
2262
347
        for (j = 0; 
j < outgoing_node->input_size347
;
j++245
)
2263
245
          
max_inputs[j] = outgoing_node->inputs[j] >= 0 ? 245
tensor_arena->vt_tensors[outgoing_node->inputs[j]]212
:
033
;
2264
223
        for (j = 0; 
j < outgoing_node->output_size223
;
j++121
)
2265
121
          
max_outputs[j] = outgoing_node->outputs[j] >= 0 ? 121
tensor_arena->vt_tensors[outgoing_node->outputs[j]]118
:
03
;
2266
102
        if (outgoing_node->graph_ref)
2267
1
        {
2268
1
          const int graph_ref = outgoing_node->graph_ref - 1;
2269
1
          ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
2270
1
          graph_execs[outgoing] = ccv_nnc_graph_while(graph, outgoing_node->cmd.cmd, sub_graph);
2271
1
          const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
2272
1
          const ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
2273
2
          for (j = 0; 
j < sub_symbolic_graph->breakpoint_size2
;
j++1
)
2274
1
            max_breakpoints[j] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[j]);
2275
1
          ccv_nnc_graph_exec_t source = ccv_nnc_graph_exec_source(sub_arena);
2276
1
          ccv_nnc_graph_exec_t destination = ccv_nnc_graph_exec_destination(sub_arena);
2277
1
          ccv_nnc_graph_set_sources(sub_graph, &source, 1);
2278
1
          ccv_nnc_graph_set_destinations(sub_graph, &destination, 1);
2279
1
          ccv_nnc_graph_set_while_expr(sub_graph, sub_symbolic_graph->while_expr, sub_symbolic_graph->while_data, max_breakpoints, sub_symbolic_graph->breakpoint_size);
2280
1
          ccv_nnc_graph_exec_set_io(graph, graph_execs[outgoing], max_inputs, outgoing_node->input_size, max_outputs, outgoing_node->output_size);
2281
101
        } else {
2282
101
          graph_execs[outgoing] = ccv_nnc_graph_exec_new(graph, outgoing_node->cmd, outgoing_node->hint, max_inputs, outgoing_node->input_size, max_outputs, outgoing_node->output_size);
2283
101
        }
2284
102
      }
2285
140
      ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
2286
140
    }
2287
106
  } ccv_nnc_graph_visit_endfor
2288
29
  if (max_inputs)
2289
28
    
ccfree28
(max_inputs)28
;
2290
29
  if (max_outputs)
2291
28
    
ccfree28
(max_outputs)28
;
2292
29
  if (max_breakpoints)
2293
10
    
ccfree10
(max_breakpoints)10
;
2294
29
  int source_exec_created = 0;
2295
29
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
2296
29
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
2297
29
  ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
2298
29
  // After the graph is materialized, we need to handle the case that some of these tensors need to be initialized to zero before use.
2299
309
  for (i = 0; 
i < symbolic_graph->tensor_symbol_info->rnum309
;
i++280
)
2300
280
  {
2301
280
    if (tensor_symbol_info[i].flags & CCV_NNC_SYM_TENSOR_INIT_ZEROS)
2302
6
    {
2303
6
      int ref = i;
2304
6
      while (tensor_symbol_info[ref].alias_ref)
2305
0
        ref = tensor_symbol_info[ref].alias_ref - 1;
2306
6
      while (
!6
TENSOR_EXPECT_COMPUTABLE6
(tensor_blocks[ref]) &&
tensor_blocks[ref].ref0
)
2307
0
        ref = tensor_blocks[ref].ref - 1;
2308
6
      // This is not computable. It could be that we marked a const tensor as init zero.
2309
6
      if (
!6
TENSOR_EXPECT_COMPUTABLE6
(tensor_blocks[ref]))
2310
0
        continue;
2311
6
      // If this tensor is not used by any exec, we don't need to init at all. Skip.
2312
6
      
if (6
!tensor_blocks[ref].head || 6
tensor_blocks[ref].head->rnum == 06
)
2313
0
        continue;
2314
6
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
2315
6
      // Now, we have the original tensor, we can get the actual tensor, and construct the set command.
2316
6
      ccv_nnc_graph_exec_t set_exec = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
2317
12
      for (j = 0; 
j < tensor_blocks[ref].head->rnum12
;
j++6
)
2318
6
      {
2319
6
        const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j);
2320
6
        assert(graph_execs[outgoing].graph);
2321
6
        ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
2322
6
      }
2323
6
      int flags = 0;
2324
6
      if (alloc_dep[ref])
2325
2
        
for (j = 0; 1
j < alloc_dep[ref]->rnum2
;
j++1
)
2326
1
        {
2327
1
          const int d = *(int*)ccv_array_get(alloc_dep[ref], j);
2328
1
          // This is from alloc_dep, it should be computable.
2329
1
          assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2330
1
          if (tensor_blocks[d].tail)
2331
2
            
for (k = 0; 1
k < tensor_blocks[d].tail->rnum2
;
k++1
)
2332
1
            {
2333
1
              const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
2334
1
              assert(graph_execs[incoming].graph);
2335
1
              ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
2336
1
              flags = 1;
2337
1
            }
2338
1
        }
2339
6
      // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
2340
6
      if (!flags)
2341
5
      {
2342
5
        if (!source_exec_created)
2343
2
        {
2344
2
          graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
2345
2
          source_exec_created = 1;
2346
2
        }
2347
5
        ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
2348
5
      }
2349
6
    }
2350
280
  }
2351
29
  // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
2352
29
  // (we need that if it is not associated as an input / output of any exec; this is possible if all execs associate
2353
29
  // with its alias).
2354
29
  assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size);
2355
309
  for (i = 0; 
i < tensor_arena->vt_tensor_size309
;
i++280
)
2356
280
  {
2357
280
    ccv_nnc_tensor_t* const mv = tensor_arena->vt_tensors[i];
2358
280
    // If it is a multiview tensor, inspect all its heads to see whether it is already associated with the node.
2359
280
    if (
mv && 280
CCV_IS_TENSOR_MULTIVIEW278
(mv))
2360
14
    {
2361
14
      const ccv_array_t* const head = tensor_blocks[i].head;
2362
14
      if (
head && 14
head->rnum > 014
)
2363
28
        
for (j = 0; 14
j < head->rnum28
;
j++14
)
2364
14
        {
2365
14
          const int idx = *(int*)ccv_array_get(head, j);
2366
14
          const int d = graph_execs[idx].d;
2367
14
          ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d);
2368
14
          int flag = 0;
2369
31
          for (k = 0; 
k < exec_info->tensor_nest_size && 31
!flag17
;
k++17
)
2370
17
            
flag = (exec_info->tensor_nests[k] && 17
exec_info->tensor_nests[k]->tensors[0] == mv9
);
2371
14
          // If none has the flag, it needs to be included in the broadcast.
2372
14
          if (!flag)
2373
10
            ccv_nnc_graph_exec_add_broadcast(graph, graph_execs[idx], mv);
2374
14
        }
2375
14
    }
2376
280
  }
2377
29
  // Create source / destination phony node. This is to facilitate use of compiled graph.
2378
29
  // Also, this is needed if you have init zero execs.
2379
29
  if (
source_exec_created || 29
source_size > 127
)
2380
5
  {
2381
5
    if (!source_exec_created)
2382
3
      
graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, 3
CMD_GENERIC3
(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
2383
14
    for (i = 0; 
i < source_size14
;
i++9
)
2384
9
      ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
2385
24
  } else {
2386
24
    assert(!source_exec_created);
2387
24
    assert(source_size == 1);
2388
24
    graph_exec_arena->source = graph_execs[sources[0].d];
2389
24
  }
2390
29
  if (destination_size == 1)
2391
23
    graph_exec_arena->destination = graph_execs[destinations[0].d];
2392
6
  else {
2393
6
    graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
2394
19
    for (i = 0; 
i < destination_size19
;
i++13
)
2395
13
      ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
2396
6
  }
2397
29
  return graph_exec_arena;
2398
29
}
2399
2400
void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
2401
19
{
2402
19
  assert(graph_ref);
2403
19
  assert(tensor_arena_ref);
2404
19
  assert(graph_exec_arena_ref);
2405
19
  int i;
2406
19
  // Cannot bind the multi-view.
2407
21
  for (i = 0; 
i < tensor_bind_size21
;
i++2
)
2408
2
    
if (2
tensor_binds[i].tensor2
)
2409
2
    { assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)); }
2410
19
  ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
2411
19
  ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, 0, tensor_binds, tensor_bind_size);
2412
19
  *tensor_arena_ref = tensor_arena;
2413
19
  // The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
2414
19
  *graph_ref = graph_prep->graph;
2415
19
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
2416
19
  *graph_exec_arena_ref = graph_exec_arena;
2417
19
  _ccv_nnc_symbolic_graph_prep_free(graph_prep);
2418
19
}
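
A hedged usage sketch (not part of the instrumented source) showing how the public entry points in this file are typically sequenced; building the symbolic graph and its source / destination exec symbols, and ccv_nnc_graph_free, belong to the wider ccv_nnc API and are assumed here for illustration only.

  static void compile_run_and_release(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
  {
    ccv_nnc_graph_t* graph = 0;
    ccv_nnc_tensor_arena_t* tensor_arena = 0;
    ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
    // No tensor binds in this sketch, hence 0 / 0 for the bind array and its size.
    ccv_nnc_symbolic_graph_compile(symbolic_graph, 0, 0, sources, source_size, destinations, destination_size, &graph, &tensor_arena, &graph_exec_arena);
    // ... run the concrete graph here ...
    ccv_nnc_graph_free(graph); // assumed ccv_nnc call; the concrete graph is not freed in this file
    ccv_nnc_tensor_arena_free(tensor_arena); // defined below in this file
    ccv_nnc_graph_exec_arena_free(graph_exec_arena); // defined below in this file
  }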
2419
2420
static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
2421
29
{
2422
29
  // Buffers are inherited from above, no need to dealloc.
2423
29
  int i;
2424
40
  for (i = 0; 
i < tensor_arena->sub_arena_size40
;
i++11
)
2425
11
    
if (11
tensor_arena->sub_arenas[i]11
)
2426
10
      _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
2427
257
  for (i = 0; 
i < tensor_arena->m_tensor_size257
;
i++228
)
2428
228
  {
2429
228
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor_arena->m_tensors[i];
2430
228
    if (
mv && 228
CCV_IS_TENSOR_MULTIVIEW228
(mv))
2431
14
      ccv_nnc_tensor_multiview_free(*mv);
2432
228
  }
2433
29
  ccv_array_free(tensor_arena->tensor_metadata);
2434
29
  ccfree(tensor_arena);
2435
29
}
2436
2437
void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
2438
19
{
2439
19
  int i;
2440
19
#ifdef HAVE_CUDA
2441
19
  if (tensor_arena->memory_type == CCV_TENSOR_GPU_MEMORY)
2442
0
  {
2443
0
    for (i = 0; 
i < tensor_arena->buffer_size0
;
i++0
)
2444
0
      cufree(tensor_arena->device_id, tensor_arena->buffers[i].ptr);
2445
19
  } else {
2446
19
    assert(tensor_arena->memory_type == CCV_TENSOR_CPU_MEMORY);
2447
143
    for (i = 0; 
i < tensor_arena->buffer_size143
;
i++124
)
2448
124
      
ccfree124
(tensor_arena->buffers[i].ptr)124
;
2449
19
  }
2450
19
#else
2451
  assert(tensor_arena->memory_type == CCV_TENSOR_CPU_MEMORY);
2452
  for (i = 0; i < tensor_arena->buffer_size; i++)
2453
    ccfree(tensor_arena->buffers[i].ptr);
2454
#endif
2455
19
  _ccv_nnc_tensor_arena_free(tensor_arena);
2456
19
}
2457
2458
void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2459
29
{
2460
29
  int i;
2461
40
  for (i = 0; 
i < graph_exec_arena->sub_arena_size40
;
i++11
)
2462
11
    
if (11
graph_exec_arena->sub_arenas[i]11
)
2463
10
      ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
2464
29
  ccfree(graph_exec_arena);
2465
29
}