Coverage Report

Created: 2025-04-03 22:59

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_compile.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#include "ccv_nnc_easy.h"
4
#include "ccv_internal.h"
5
#ifdef HAVE_CUDA
6
#include "gpu/ccv_nnc_compat.h"
7
#elif defined(HAVE_MPS)
8
#include "mps/ccv_nnc_mps.h"
9
#endif
10
#include "_ccv_nnc_graph.h"
11
#include "_ccv_nnc_symbolic_graph.h"
12
13
// MARK - Level-3 API
14
15
typedef struct {
16
  int flags;
17
  int type;
18
  int pin_mem; // This memory needs to be pinned.
19
  int ref; // Reference to another tensor block. Start with 1.
20
  int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21
  int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22
  int companion_ref; // Reference to another block such that the two share the same memory region. Start with 1. The current crude implementation requires the two to mutually be companions. Because there are two, we take the one where companion_ref <= i as the primary and companion_ref > i as the secondary. For the allocation algorithm, we use the primary throughout.
23
  int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24
  ccv_array_t* r_refs; // If this is referenced by another block, the array points back to those blocks. Start with 1.
25
  uint64_t size; // The size of the tensor expected.
26
  int p_refs[2]; // Reference to the parent tensor block; at most there will be only two. Start with 1.
27
  ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28
  ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29
  ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30
} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
32
4.34M
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
34
enum {
35
  UNASSIGNED = 0x1,
36
  ALIAS = 0x2,
37
  READ_ONLY = 0x4,
38
  WRITE_ONLY = 0x8,
39
  READ_WRITE = 0xc,
40
  ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
41
  UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42
  UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43
};
44
45
#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
46
#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
47
5.57M
#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
48
6.40k
#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
49
3
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
50
9.24M
#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
51
8.59M
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
52
27.7k
#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
53
6.48k
#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
54
95
#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55
#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
56
180
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57
19.7k
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
58
116
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59
13.3k
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
60
61
118k
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
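// [Illustrative aside, not part of the original ccv_nnc_symbolic_graph_compile.c listing.]
// The flags field packs three independent pieces of state: a 2-bit assignment state in
// bits 0..1 (UNASSIGNED / ALIAS), a 2-bit read/write state in bits 2..3, and standalone
// marker bits. A minimal sketch of how the accessor macros above compose, assuming only
// the struct and macros defined earlier in this unit (the function name is hypothetical):
static void _ccv_nnc_tensor_block_flag_sketch(void)
{
  ccv_nnc_tensor_block_t blk = { .flags = 0 };
  TENSOR_EXPECT_SET_UNASSIGNED(blk);       // bits 0..1 now hold UNASSIGNED
  assert(TENSOR_EXPECT_UNASSIGNED(blk));
  assert(!TENSOR_EXPECT_COMPUTABLE(blk));  // computable means neither alias nor unassigned
  TENSOR_EXPECT_UNSET_UNASSIGNED(blk);
  TENSOR_SET_READ_WRITE(blk, READ_ONLY);   // bits 2..3 hold the read/write state
  assert(TENSOR_READ_WRITE(blk) == READ_ONLY);
  TENSOR_SET_UNFOLDABLE_AS_INPUT(blk);     // marker bits are independent of the states above
  assert(TENSOR_IS_UNFOLDABLE_AS_INPUT(blk) && TENSOR_EXPECT_COMPUTABLE(blk));
}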
62
63
// Holds additional information about the exec nodes.
64
typedef struct {
65
  int flags;
66
} ccv_nnc_graph_exec_flag_t;
67
68
enum {
69
  CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
70
};
71
72
typedef struct {
73
  int index;
74
  int oc;
75
  int type;
76
  uint64_t size;
77
} ccv_nnc_tensor_opt_t;
78
79
// We first sort the same type together (because they won't be reused at all).
80
// And then we sort by size, after that, sort by oc.
81
226k
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82
226k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
83
#undef more_than
84
typedef struct {
85
  int idx;
86
  int hop;
87
} ccv_nnc_tensor_hop_t;
88
225k
#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89
225k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
90
#undef less_than
91
92
// If b has items that overlap with a, a is still considered after b (inclusive).
93
static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94
0
{
95
0
  assert(a);
96
0
  assert(b);
97
0
  int x, y;
98
0
  for (x = 0; x < b->rnum; x++)
99
0
  {
100
0
    const int p = *(int*)ccv_array_get(b, x);
101
0
    int flag = 0;
102
    // In the extreme case where a is a superset of b, a is still after b, so we are good.
103
0
    for (y = 0; !flag && y < a->rnum; y++)
104
0
    {
105
0
      const int q = *(int*)ccv_array_get(a, y);
106
0
      flag = (p == q);
107
0
    }
108
0
    if (!flag)
109
0
      for (y = 0; y < a->rnum; y++)
110
0
      {
111
0
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
112
0
        if (!cell.i32 || cell.i32[0] == 0)
113
0
          return 0;
114
0
      }
115
0
  }
116
  // If b->rnum == 0, a is after b for sure.
117
  // Otherwise, if a->rnum == 0, we don't check any, but if b->rnum > 0, then we cannot say a is after b.
118
  // If both a->rnum > 0 and b->rnum > 0, the above logic should have checked them all.
119
0
  return (a->rnum > 0 || b->rnum == 0);
120
0
}
121
122
static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123
1.27M
{
124
1.27M
  assert(a);
125
1.27M
  assert(b);
126
1.27M
  int x, y, max_hop = 0;
127
1.34M
  for (x = 0; x < a->rnum; x++)
128
1.34M
    for (y = 0; y < b->rnum; y++)
129
1.27M
    {
130
1.27M
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
131
1.27M
      if (!cell.i32 || cell.i32[0] == 0)
132
1.20M
        return 0;
133
73.5k
      max_hop = ccv_max(cell.i32[0], max_hop);
134
73.5k
    }
135
  // If we made it through this nested for-loop without returning, a is verifiably, deterministically after b.
136
  // The max hop also denotes, in that case, how many hops, maximally speaking, we need to get from a to b.
137
73.0k
  return max_hop;
138
1.27M
}
139
140
// If every a's head is deterministically after b's tail
141
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
142
1.27M
{
143
1.27M
  return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
144
1.27M
}
145
146
typedef struct {
147
  ccv_array_t** alloc_dep;
148
  int vt_block_size;
149
  int buffer_size;
150
  int block_size;
151
  int* vt_blocks; // A reference to the block, because blocks only contains available blocks (thus, doesn't consider aliases etc.). -1 means no block pointed to. Starts at 0.
152
  struct {
153
    int type; // The type from tensor blocks.
154
    int pin_mem; // Whether this is pinned memory.
155
    int flags; // The flags (currently for READ_ONLY or not).
156
    uint64_t size; // The size of the buffer allocated.
157
    int p_refs[2]; // Reference to the upper-level block. Starts at 1. Only index 0 is valid throughout; I do use the second one in the code as a temporary placeholder.
158
    ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From a buffer, it can point to multiple because it can be associated with multiple tensor blocks that point to different outputs (for example, in the 1st unroll pointing to one block while in the 2nd unroll pointing to another). Start with 0.
159
  }* buffers;
160
  struct {
161
    int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
162
    int block_ref; // A reference to which block in the given tensor_block to use.
163
    uint64_t offset; // The offset of this block.
164
  }* blocks;
165
} ccv_nnc_tensor_alloc_prep_t;
166
167
typedef struct ccv_nnc_symbolic_graph_prep_s {
168
  int flags;
169
  int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
170
  int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
171
  int exec_idx;
172
  int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
173
  int tensor_symbol_info_size;
174
  int exec_symbol_info_size;
175
  int tensor_block_size;
176
  int sub_prep_size;
177
  ccv_nnc_tensor_block_t* tensor_blocks;
178
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
179
  ccv_nnc_graph_exec_flag_t* exec_flags;
180
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
181
  int* dup_tensor_block_ref;
182
  ccv_nnc_graph_visit_t* visit;
183
  ccv_nnc_tensor_alloc_prep_t* alloc_prep;
184
  struct ccv_nnc_symbolic_graph_prep_s* p;
185
  struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
186
  // Structures that don't need to be freed after deallocation.
187
  const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
188
  ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
189
  ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
190
  ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
191
} ccv_nnc_symbolic_graph_prep_t;
192
193
typedef struct {
194
  int oc;
195
  ccv_array_t* itf;
196
} ccv_nnc_tensor_block_adjacent_t;
197
198
static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
199
6.22k
{
200
  // Compute how many discontinuous buffers are needed.
201
  // We prefer to have several discontinuous buffers instead of one big buffer because
202
  // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
203
  // to fully utilize memory.
204
6.22k
  int i, j, k;
205
6.22k
  ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
206
6.22k
  int allocable_tensor_size = 0, available_tensor_size = 0;
207
97.6k
  for (i = 0; i < tensor_block_size; i++)
208
91.4k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
209
30.1k
    {
210
      // Tensors for which we need the header info.
211
30.1k
      ++available_tensor_size;
212
30.1k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
213
        // Tensors that we actually need to allocate (excluding the aliases).
214
27.4k
        ++allocable_tensor_size;
215
30.1k
    }
216
6.22k
  ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
217
6.22k
  ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
218
6.22k
  ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
219
  // Overlap count.
220
97.6k
  for (i = 0; i < tensor_block_size; i++)
221
91.4k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
222
1.68M
      for (j = i + 1; j < tensor_block_size; j++)
223
1.66M
        if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
224
637k
        {
225
          // Check to see if they interfere (default to yes).
226
          // If any of i's heads is deterministically later than j's tail
227
          // or any of i's tails is deterministically earlier than j's head, they don't interfere.
228
637k
          const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
229
637k
          if (i_hop_j > 0)
230
293
          {
231
293
            ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
232
293
            ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
233
293
          }
234
637k
          const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
235
637k
          if (j_hop_i > 0)
236
72.7k
          {
237
72.7k
            ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
238
72.7k
            ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
239
72.7k
          }
240
          // It cannot be that both i can hop to j and j can hop to i.
241
637k
          assert(!(i_hop_j > 0 && j_hop_i > 0));
242
637k
          if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
243
132k
          {
244
132k
            if (!adj[i].itf)
245
4.60k
              adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
246
132k
            ccv_array_push(adj[i].itf, &j);
247
132k
            ++adj[i].oc;
248
132k
            if (!adj[j].itf)
249
22.4k
              adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
250
132k
            ccv_array_push(adj[j].itf, &i);
251
132k
            ++adj[j].oc;
252
132k
          }
253
637k
        }
254
6.22k
  const int exec_dep_rows = exec_dep->rows;
255
6.22k
  ccv_matrix_free(exec_dep);
256
6.22k
  ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
257
6.22k
  int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
258
6.22k
  uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
259
6.22k
  uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
260
6.22k
  uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
261
6.22k
  int num_assigned = 0; 
262
  // I can do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
263
  // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
264
  // The first channel denotes the bytes available for allocation,
265
  // the second channel denotes the offset available for the allocation,
266
6.22k
  ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
267
6.22k
  ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
268
33.6k
  for (j = 0; j < allocable_tensor_size;)
269
27.3k
  {
270
    // Find the one with the largest overlap (in case the overlap is the same, the larger size) that is not assigned.
271
27.3k
    uint64_t max_size = 0;
272
27.3k
    ccv_array_clear(opt);
273
27.3k
    int current_type = 0; // Deal with one type at a time.
274
4.00M
    for (i = 0; i < tensor_block_size; i++)
275
3.97M
      if (tensor_blocks[i].size >= max_size &&
276
3.97M
        TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
277
3.97M
        IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
278
3.97M
        (!current_type || tensor_blocks[i].type == current_type))
279
122k
      {
280
122k
        ccv_nnc_tensor_opt_t a = {
281
122k
          .size = tensor_blocks[i].size,
282
122k
          .index = i,
283
122k
          .oc = adj[i].oc,
284
122k
          .type = tensor_blocks[i].type,
285
122k
        };
286
122k
        assert(a.type);
287
122k
        current_type = a.type; // Now we know the primary type we should deal with.
288
122k
        if (tensor_blocks[i].companion_ref)
289
36
        {
290
36
          const int companion_ref = tensor_blocks[i].companion_ref - 1;
291
36
          a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
292
36
          a.oc += adj[companion_ref].oc;
293
36
        }
294
        // In case we have a tie, take them all in the array.
295
122k
        if (a.size > max_size)
296
31.9k
          ccv_array_clear(opt), max_size = a.size;
297
122k
        ccv_array_push(opt, &a);
298
122k
      }
299
27.3k
    assert(opt->rnum > 0);
300
    // Order opt array by the oc because type and size should be equal at this point.
301
27.3k
    _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
302
    // Go through the opt array again; this time it is ordered by size, therefore, if we find a place to insert, we are good.
303
27.3k
    int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
304
27.3k
    uint64_t min_val[2] = {
305
27.3k
      0, 0
306
27.3k
    };
307
27.3k
    if (j > 0)
308
22.5k
    {
309
69.5k
      for (i = 0; i < opt->rnum; i++)
310
58.0k
      {
311
58.0k
        ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
312
58.0k
        if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
313
28.7k
          continue;
314
        // Now, determine the order between a and c. After this, we can always check whether y
315
        // can hop to the earliest one and if the latest one can hop to x.
316
        // The earliest one will be called p and the latest one will be called q.
317
29.2k
        int p = a.index;
318
29.2k
        int q = a.index;
319
29.2k
        if (tensor_blocks[a.index].companion_ref)
320
16
        {
321
16
          const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
322
16
          if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
323
3
            continue;
324
13
          const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
325
13
          if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
326
1
            p = companion_ref;
327
12
          else {
328
12
            const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
329
12
            if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
330
12
              q = companion_ref;
331
0
            else { // Otherwise, b is in between p and q.
332
0
              const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
333
0
              const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
334
0
              assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
335
0
            }
336
12
          }
337
13
        }
338
29.2k
        assert(tensor_blocks[q].type == tensor_blocks[p].type);
339
29.2k
        const int type = tensor_blocks[p].type;
340
        // y is always earlier than x, but this is hard to assert now.
341
        // If this edge satisfies the requirement, we now need to find the ones with the tightest possible bounds.
342
        // Thus, the hop between y and x (through a) should be the smallest one.
343
        // We optimized this by first finding all allocated nodes that come to p, and all allocated nodes that
344
        // go out of q. For these nodes, we try to verify whether they form a connection (by checking against
345
        // the alloc sparse matrix). If they do, we try to see whether we can insert with the tightest bound.
346
29.2k
        int y_size = 0;
347
29.2k
        ccv_nnc_tensor_hop_t* const y_buf = buf;
348
96.2k
#define for_block(y, val) do { \
349
96.2k
          if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
350
96.2k
            y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
351
35.7k
              .idx = y + 1, .hop = ((int*)val)[0] \
352
35.7k
            }; \
353
96.2k
        } while(0)
354
29.2k
        ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
355
29.2k
        if (y_vector)
356
96.2k
          CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
357
29.2k
#undef for_block
358
29.2k
        assert(y_size <= tensor_block_size);
359
29.2k
        int x_size = 0;
360
29.2k
        ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
361
76.8k
#define for_block(x, val) do { \
362
76.8k
          if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
363
76.8k
            x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
364
30.8k
              .idx = x + 1, .hop = ((int*)val)[0] \
365
30.8k
            }; \
366
76.8k
        } while(0)
367
29.2k
        ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
368
29.2k
        if (x_vector)
369
76.8k
          CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
370
29.2k
#undef for_block
371
29.2k
        assert(y_size + x_size <= tensor_block_size);
372
29.2k
        int x, y;
373
29.2k
        _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
374
41.7k
        for (y = 0; y < y_size; y++)
375
18.7k
        {
376
18.7k
          const int hop = exec_dep_rows + y_buf[y].hop;
377
18.7k
          if (hop >= min_hop)
378
0
            break;
379
18.7k
          const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
380
18.7k
          if (val.u64 && val.u64[0] >= a.size)
381
6.24k
          {
382
6.24k
            min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
383
6.24k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
384
6.24k
            break;
385
6.24k
          }
386
18.7k
        }
387
29.2k
        _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
388
41.7k
        for (x = 0; x < x_size; x++)
389
15.5k
        {
390
15.5k
          const int hop = exec_dep_rows + x_buf[x].hop;
391
15.5k
          if (hop >= min_hop)
392
260
            break;
393
15.3k
          const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
394
15.3k
          if (val.u64 && val.u64[0] >= a.size)
395
2.81k
          {
396
2.81k
            min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
397
2.81k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
398
2.81k
            break;
399
2.81k
          }
400
15.3k
        }
401
29.2k
        if (x_size > 0)
402
10.1k
        {
403
10.1k
          const int x_min_hop = x_buf[0].hop;
404
15.7k
          for (y = 0; y < y_size; y++)
405
5.99k
          {
406
5.99k
            const int y_hop_p_v = y_buf[y].hop;
407
5.99k
            if (y_hop_p_v + x_min_hop >= min_hop)
408
405
              break;
409
5.59k
            ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
410
5.59k
            if (y_vector)
411
5.59k
            {
412
52.1k
              for (x = 0; x < x_size; x++)
413
49.4k
              {
414
49.4k
                const int q_hop_x_v = x_buf[x].hop;
415
49.4k
                const int hop = y_hop_p_v + q_hop_x_v;
416
49.4k
                if (hop >= min_hop)
417
420
                  break;
418
49.0k
                const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
419
49.0k
                if (val.u64 && val.u64[0] >= a.size)
420
2.50k
                {
421
2.50k
                  min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
422
2.50k
                    min_val[0] = val.u64[0], min_val[1] = val.u64[1];
423
2.50k
                  break;
424
2.50k
                }
425
49.0k
              }
426
5.59k
            }
427
5.59k
          }
428
10.1k
        }
429
        // If I found a place, stop, and exit.
430
29.2k
        if (min_y > 0 || min_x < tensor_block_size + 1)
431
11.1k
        {
432
11.1k
          min_i = i;
433
11.1k
          break;
434
11.1k
        }
435
        // There is no space to insert this block, mark it as such.
436
18.1k
        tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
437
18.1k
        if (tensor_blocks[a.index].companion_ref)
438
13
        {
439
13
          const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
440
13
          tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
441
13
        }
442
18.1k
      }
443
22.5k
    }
444
    // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
445
    // and default to largest size available.
446
27.3k
    ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
447
27.3k
    if (min_i == -1)
448
16.2k
    {
449
16.2k
      allocated_size[num_assigned] = a.size;
450
16.2k
      ++num_assigned;
451
16.2k
    }
452
27.3k
    int assign_group = num_assigned;
453
27.3k
    if (min_y > 0)
454
8.55k
    {
455
8.55k
      assign_group = assigned[min_y - 1];
456
      // The y and x should belong to the same assigned group.
457
8.55k
      assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
458
18.8k
    } else if (min_x < tensor_block_size + 1)
459
2.57k
      assign_group = assigned[min_x - 1];
460
    // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
461
27.3k
    if (min_y != 0 || min_x != tensor_block_size + 1)
462
11.1k
    {
463
11.1k
      uint64_t val[2] = {
464
11.1k
        min_val[0], min_val[1]
465
11.1k
      };
466
11.1k
      assert(val[0] >= a.size);
467
11.1k
      val[0] -= a.size;
468
11.1k
      val[1] = val[1] + a.size; // Move the offset to the next one.
469
11.1k
      ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
470
11.1k
    }
471
27.3k
    int strings[3];
472
27.3k
    strings[0] = a.index + 1;
473
27.3k
    int string_size = 1;
474
    // Assign out the designated companion if it exists.
475
27.3k
    if (tensor_blocks[a.index].companion_ref)
476
20
    {
477
20
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
478
20
      assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
479
20
      const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
480
20
      if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
481
2
      {
482
4
        for (i = 0; i < string_size; i++)
483
2
          strings[i + 1] = strings[i];
484
2
        strings[0] = companion_ref + 1;
485
18
      } else {
486
18
        const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
487
18
        if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
488
18
          strings[string_size] = companion_ref + 1;
489
0
        else {
490
          // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
491
0
          assert(string_size == 2);
492
0
          strings[2] = strings[1];
493
0
          strings[1] = companion_ref + 1;
494
0
        }
495
18
      }
496
20
      ++string_size;
497
20
    }
498
    // Assign out and update oc.
499
54.7k
    for (i = 0; i < string_size; i++)
500
27.4k
    {
501
27.4k
      const int index = strings[i] - 1;
502
      // Assign out the selected one.
503
27.4k
      assigned[index] = assign_group;
504
      // The offset for this one should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
505
27.4k
      allocated_offset[index] = min_val[1];
506
27.4k
      if (adj[index].itf)
507
292k
        for (k = 0; k < adj[index].itf->rnum; k++)
508
265k
        {
509
265k
          const int d = *(int*)ccv_array_get(adj[index].itf, k);
510
265k
          if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
511
132k
            --adj[d].oc;
512
265k
        }
513
27.4k
    }
514
27.3k
    uint64_t val[2] = {
515
27.3k
      a.size, min_val[1]
516
27.3k
    };
517
27.3k
    uint64_t consumed_size = 0;
518
    // Go over from min_y to string_size (excluding min_x).
519
27.3k
    for (i = 0; i < string_size; i++)
520
27.3k
    {
521
27.3k
      const uint64_t size = tensor_blocks[strings[i] - 1].size;
522
27.3k
      assert(size <= a.size);
523
      // Update consumed size if it is bigger than "size".
524
27.3k
      if (size > consumed_size)
525
27.3k
      {
526
27.3k
        val[0] = size - consumed_size;
527
27.3k
        ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
528
27.3k
        consumed_size = size;
529
27.3k
        val[1] = min_val[1] + consumed_size;
530
27.3k
      }
531
      // If it consumed all the flow, break out.
532
27.3k
      if (consumed_size == a.size)
533
27.3k
        break;
534
27.3k
    }
535
54.7k
    for (i = 0; i < string_size; i++)
536
27.4k
    {
537
27.4k
      const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
538
27.4k
      uint64_t val[2] = {
539
27.4k
        i_size, min_val[1]
540
27.4k
      };
541
27.4k
      uint64_t consumed_size = 0;
542
27.4k
      for (k = i + 1; k < string_size; 
k++0
)
543
20
      {
544
20
        const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
545
        // Update consumed size if it is bigger than "size".
546
20
        if (size > consumed_size)
547
20
        {
548
20
          val[0] = size - consumed_size;
549
20
          ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
550
20
          consumed_size = size;
551
20
          val[1] = min_val[1] + consumed_size;
552
20
        }
553
        // If it consumed all the flow, break out.
554
20
        if (consumed_size == i_size)
555
20
          break;
556
20
      }
557
27.4k
      val[0] = i_size - consumed_size;
558
      // Still have residual, flow it to min_x.
559
27.4k
      if (val[0] > 0)
560
27.3k
        ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
561
27.4k
    }
562
27.3k
    if (min_i == -1)
563
16.2k
    {
564
      // If we decided to insert a new edge, simply mark anyone who does not interfere with it to be redone.
565
16.2k
      const int p = strings[0] - 1;
566
16.2k
      const int q = strings[string_size - 1] - 1;
567
16.2k
      const int type = tensor_blocks[p].type;
568
16.2k
#define for_block(y, val) do { \
569
9.13k
        if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
570
9.13k
        { \
571
4.98k
          tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
572
4.98k
          if (tensor_blocks[y].companion_ref) \
573
4.98k
          { \
574
3
            const int companion_ref = tensor_blocks[y].companion_ref - 1; \
575
3
            tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
576
3
          } \
577
4.98k
        } \
578
9.13k
      } while(0)
579
16.2k
      ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
580
16.2k
      if (y_vector)
581
9.13k
        CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
582
16.2k
#undef for_block
583
30.0k
#define for_block(x, val) do { \
584
30.0k
        if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
585
30.0k
        { \
586
14.2k
          tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
587
14.2k
          if (tensor_blocks[x].companion_ref) \
588
14.2k
          { \
589
2
            const int companion_ref = tensor_blocks[x].companion_ref - 1; \
590
2
            tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
591
2
          } \
592
14.2k
        } \
593
30.0k
      } while(0)
594
16.2k
      ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
595
16.2k
      if (x_vector)
596
30.0k
        CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
597
16.2k
#undef for_block
598
16.2k
    }
599
27.3k
    j += string_size;
600
27.3k
  }
601
6.22k
  ccfree(tensor_block_cannot_insert);
602
6.22k
  ccfree(buf);
603
6.22k
  ccv_array_free(opt);
604
6.22k
  ccv_matrix_free(tensor_df);
605
6.22k
  ccv_matrix_free(tensor_dt);
606
54.7k
#define for_block(y, x, val) do { \
607
54.7k
    if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
608
54.7k
    { \
609
11.3k
      if (!alloc_dep[x - 1]) \
610
11.3k
        alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
611
11.3k
      ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
612
11.3k
    } \
613
54.7k
  } while (0)
614
54.7k
  CCV_SPARSE_FOREACH(alloc, for_block);
615
6.22k
#undef for_block
616
6.22k
  ccv_matrix_free(alloc);
617
97.6k
  for (i = 0; i < tensor_block_size; i++)
618
91.4k
    if (adj[i].itf)
619
27.0k
      ccv_array_free(adj[i].itf);
620
6.22k
  ccfree(adj);
621
6.22k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
622
6.22k
  alloc_prep->alloc_dep = alloc_dep;
623
6.22k
  alloc_prep->vt_block_size = tensor_block_size;
624
6.22k
  alloc_prep->buffer_size = num_assigned;
625
6.22k
  alloc_prep->block_size = available_tensor_size;
626
6.22k
  alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
627
6.22k
  alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
628
6.22k
  alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
629
6.22k
  memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
630
22.4k
  for (i = 0; i < num_assigned; i++)
631
16.2k
    alloc_prep->buffers[i].size = allocated_size[i];
632
6.22k
  if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
633
0
  {
634
0
    size_t total_size = 0;
635
0
    for (i = 0; i < num_assigned; i++)
636
0
      total_size += allocated_size[i];
637
0
    PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size);
638
0
  }
639
6.22k
  ccfree(allocated_size);
640
6.22k
  j = 0;
641
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
642
97.6k
  for (i = 0; i < tensor_block_size; i++)
643
91.4k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
644
30.1k
    {
645
30.1k
      alloc_prep->blocks[j].block_ref = i;
646
30.1k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
647
27.4k
      {
648
27.4k
        alloc_prep->vt_blocks[i] = j;
649
        // Also, set its allocations.
650
27.4k
        assert(assigned[i] > 0);
651
27.4k
        const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
652
27.4k
        alloc_prep->blocks[j].offset = allocated_offset[i];
653
27.4k
        if (!alloc_prep->buffers[buffer_ref].type)
654
16.2k
          alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
655
27.4k
        alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
656
27.4k
        alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
657
27.4k
        assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
658
27.4k
      } else {
659
2.71k
        alloc_prep->vt_blocks[i] = -1;
660
2.71k
        alloc_prep->blocks[j].buffer_ref = -1;
661
2.71k
        alloc_prep->blocks[j].offset = 0;
662
2.71k
      }
663
30.1k
      ++j;
664
30.1k
    } else
665
61.3k
      alloc_prep->vt_blocks[i] = -1;
666
6.22k
  ccfree(allocated_offset);
667
6.22k
  ccfree(assigned);
668
6.22k
  return alloc_prep;
669
6.22k
}
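// [Illustrative aside, not part of the original listing.] What the planner above produces:
// every non-alias, assigned tensor block ends up as a (buffer_ref, offset) pair into one of
// the num_assigned coalesced buffers, while alias blocks carry buffer_ref == -1 and borrow
// memory from the block they alias. A sketch that walks the result (the function name is
// hypothetical; PRINT is used the same way elsewhere in this file):
static void _ccv_nnc_tensor_alloc_prep_dump_sketch(const ccv_nnc_tensor_alloc_prep_t* const alloc_prep)
{
  int i;
  for (i = 0; i < alloc_prep->block_size; i++)
  {
    const int block_ref = alloc_prep->blocks[i].block_ref;
    const int buffer_ref = alloc_prep->blocks[i].buffer_ref;
    if (buffer_ref < 0) // alias block, no allocation of its own
      continue;
    PRINT(CCV_CLI_INFO, "block %d -> buffer %d at offset %llu (buffer size %llu)\n",
      block_ref, buffer_ref,
      (unsigned long long)alloc_prep->blocks[i].offset,
      (unsigned long long)alloc_prep->buffers[buffer_ref].size);
  }
}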
670
671
static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
672
6.22k
{
673
6.22k
  int i;
674
97.6k
  for (i = 0; i < alloc_prep->vt_block_size; i++)
675
91.4k
    if (alloc_prep->alloc_dep[i])
676
10.9k
      ccv_array_free(alloc_prep->alloc_dep[i]);
677
22.4k
  for (i = 0; i < alloc_prep->buffer_size; i++)
678
16.2k
    if (alloc_prep->buffers[i].dup_p_refs)
679
13
      ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
680
6.22k
  ccfree(alloc_prep->alloc_dep);
681
6.22k
  ccfree(alloc_prep);
682
6.22k
}
683
684
// Simple allocator from ccv_array_t.
685
static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
686
76.7k
{
687
76.7k
  int pos = tensor_metadata->rnum;
688
76.7k
  int rsize = (size + 15) / 16;
689
76.7k
  ccv_array_resize(tensor_metadata, pos + rsize);
690
76.7k
  return (pos << 1) + 1;
691
76.7k
}
692
693
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
694
163k
{
695
163k
  assert((pos >> 1) < tensor_metadata->rnum);
696
163k
  return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
697
163k
}
698
699
83.6k
#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
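// [Illustrative aside, not part of the original listing.] The metadata pool above hands out
// "positions" instead of pointers because ccv_array_resize() may move the backing storage.
// A position is (index << 1) + 1, so its low bit is always set, while a real (aligned) tensor
// pointer has a clear low bit; CCV_NNC_IS_METADATA_POS tells the two apart, and
// _ccv_nnc_tensor_metadata_rewire() below later swaps real pointers back in. A small sketch
// (the function name is hypothetical):
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_pos_sketch(ccv_array_t* const tensor_metadata)
{
  const int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
  assert(CCV_NNC_IS_METADATA_POS((ccv_nnc_tensor_t*)(intptr_t)pos)); // low bit is 1
  // Until rewiring, the encoded position itself is stored wherever a pointer would normally go.
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
  return tensor; // only valid until the next _ccv_nnc_tensor_metadata_pos_new resizes the array
}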
700
701
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
702
83.1k
{
703
  // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
704
83.1k
  if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
705
0
    return vt_tensor;
706
83.1k
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
707
83.1k
  if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref))
708
80
  {
709
80
    const int alias_ref = tensor->alias_ref;
710
80
    tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
711
80
    _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
712
80
  }
713
83.1k
  if (CCV_IS_TENSOR_MULTIVIEW(tensor))
714
84
  {
715
84
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
716
84
    int i;
717
84
    const int count = mv->kind + mv->repeat;
718
267
    for (i = 0; i < count; i++)
719
183
    {
720
183
      if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
721
147
      {
722
147
        const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
723
147
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
724
147
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
725
147
      }
726
183
    }
727
    // No need to recursively do parent pointer, otherwise we are in deep rewire.
728
84
    if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p))
729
0
      mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
730
84
    if (mv->sp)
731
65
      for (i = 0; i < mv->sp->rnum; i++)
732
37
      {
733
37
        ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
734
37
        if (CCV_NNC_IS_METADATA_POS(*tensor))
735
30
        {
736
30
          const int pos = (int)(intptr_t)*tensor;
737
30
          *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
738
30
          assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
739
30
          _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
740
30
        }
741
37
      }
742
84
  }
743
83.1k
  return tensor;
744
83.1k
}
745
746
typedef struct {
747
  const uint8_t* ptr;
748
  int pos;
749
} ccv_nnc_tensor_block_pos_t;
750
751
static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
752
114
{
753
114
  int i;
754
114
  int unref_block_ref = block_ref;
755
120
  while (prep->tensor_blocks[unref_block_ref].ref)
756
6
    unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
757
114
  int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
758
114
  assert(vt_ref >= 0);
759
114
  assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
760
114
  const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
761
114
  uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
762
114
  int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
763
114
  for (i = idx - 1; i >= 0; i--)
764
114
  {
765
114
    assert(p_ref >= 0);
766
114
    const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
767
114
    const int unroll_count = graph_prep->unroll_count;
768
114
    if (ch[i]) // Prefer the dup side of things.
769
12
      p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
770
114
    int unref_p_ref = p_ref;
771
114
    while (graph_prep->tensor_blocks[unref_p_ref].ref)
772
0
      unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
773
114
    vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
774
114
    const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
775
114
    offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
776
    // If the buffer already exists, prefer that.
777
114
    const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
778
114
    if (ptr)
779
114
    {
780
      // If I have any remaining path that is not covered from 0, I cannot possibly
781
      // have any pointer from buffer (that can only happen if it is not dup).
782
138
      for (--i; i >= 0; i--)
783
24
        if (ch[i] != 0)
784
0
          return 0;
785
      // Try to find the created tensor block pos in the array, just linear scan.
786
114
      const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
787
114
      ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
788
114
      *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
789
114
      ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
790
114
      return tv_pos;
791
114
    }
792
0
    p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
793
0
  }
794
0
  return 0;
795
114
}
796
797
// Descend from the root to the prep level, and compose the multiview from there.
798
static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
799
114
{
800
114
  assert(pos_ref);
801
114
  int i;
802
114
  const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
803
114
  const int unroll_count = prep->unroll_count;
804
114
  if (prep == graph_prep)
805
57
  {
806
57
    const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
807
57
    if (!data_pos)
808
0
      return -1;
809
    // Based on ch, go all the way back to find the exact pointer to compose.
810
57
    if (// !assign_update && // If I plan to receive assign updates, we don't need to have multiple receivers. Just one tensor to receive the update is enough.
811
57
      prep->dup_tensor_block_ref &&
812
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
813
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
814
41
    {
815
41
      int pos[unroll_count + 1];
816
41
      pos[0] = data_pos;
817
98
      for (i = 0; i < unroll_count; i++)
818
57
        pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
819
41
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
820
41
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
821
41
      ccv_nnc_tensor_t* data[unroll_count + 1];
822
139
      for (i = 0; i < unroll_count + 1; i++)
823
98
        data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
824
41
      ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
825
139
      for (i = 0; i < unroll_count + 1; i++)
826
98
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
827
41
      *pos_ref = mv_pos;
828
41
    } else {
829
16
      *pos_ref = data_pos;
830
16
    }
831
57
    if (preserve)
832
5
    {
833
      // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv;
834
      // at any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
835
      // a mv of K11; thus, when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
836
      // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
837
      // arena is allocated).
838
      // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
839
      // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
840
      // it in a K01 structure.
841
      // Why didn't we wrap it directly as mv->data[0] pointing to an assigned tv pointer and mv->data[1] pointing
842
      // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one
843
      // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
844
5
      int prev_mv_pos = *pos_ref;
845
5
      if (prev_mv_pos == -1)
846
0
      {
847
0
        prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
848
0
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
849
0
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
850
0
        ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
851
0
          tv,
852
0
        }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
853
0
        CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
854
0
      }
855
5
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
856
5
      ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
857
5
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
858
5
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
859
5
        CCV_NNC_TENSOR_PLACEHOLDER,
860
5
        (ccv_nnc_tensor_t*)prev_mv,
861
5
      }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
862
5
      prev_mv->p = (void*)(intptr_t)mv_pos;
863
5
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
864
5
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
865
5
      *pos_ref = mv_pos;
866
5
    }
867
57
    return 0;
868
57
  }
869
57
  ch[idx] = 0;
870
57
  int pos[unroll_count + 1];
871
57
  pos[0] = 0;
872
57
  const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
873
57
  assert(retval == 0);
874
67
  for (i = 0; i < unroll_count; i++)
875
10
  {
876
10
    ch[idx] = i + 1;
877
10
    pos[i + 1] = 0;
878
10
    const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
879
10
    if (dup_retval < 0)
880
0
    {
881
0
      assert(i == 0);
882
0
      break;
883
0
    }
884
10
  }
885
  // If current prep has no dup.
886
57
  if (i == 0)
887
47
  {
888
47
    *pos_ref = pos[0];
889
47
    return 0;
890
47
  }
891
10
  ccv_nnc_tensor_t* data[unroll_count + 1];
892
  // Compose to a new multiview.
893
30
  for (i = 0; i < unroll_count + 1; i++)
894
20
    { assert(pos[i] > 0); }
895
10
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
896
30
  for (i = 0; i < unroll_count + 1; i++)
897
20
    data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
898
10
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
899
10
  ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
900
30
  for (i = 0; i < unroll_count + 1; i++)
901
20
    if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
902
4
      ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
903
30
  for (i = 0; i < unroll_count + 1; i++)
904
20
    CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
905
10
  *pos_ref = mv_pos;
906
10
  return 0;
907
10
}
908
909
static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
910
312
{
911
312
  int i;
912
312
  int is_input = 0;
913
312
  assert(node);
914
766
  for (i = 0; i < node->input_size && !is_input; i++)
915
454
    if (p_ref == node->inputs[i])
916
153
      is_input = 1;
917
312
  int is_output = 0;
918
725
  for (i = 0; i < node->output_size && !is_output; i++)
919
413
    if (p_ref == node->outputs[i])
920
167
      is_output = 1;
921
  // Prefer to treat it as an output if it is both an input and an output.
922
312
  if (is_output)
923
167
    return 1;
924
145
  if (is_input)
925
145
    return -1;
926
0
  return 0;
927
145
}
928
929
static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
930
61
{
931
  // No need to check whether to preserve if this is not a while loop.
932
61
  if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
933
8
    return 0;
934
61
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
935
  // If it is unassigned, no need to preserve.
936
53
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
937
2
    return 0;
938
51
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
939
  // If p is not input, no need to preserve at all.
940
51
  if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
941
19
    return 0;
942
32
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
943
32
  assert(vt_ref >= 0);
944
32
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
945
32
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
946
  // If the buffer is a truly read-only one, no need to preserve.
947
32
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
948
6
    return 0;
949
  /* This needs a detailed explanation: what does preserve mean?
950
   * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
951
   * also used outside of the while loop, we cannot reuse the memory region of x for
952
   * the loop, otherwise we will destroy x when doing the y = x + 1 computation (assuming
953
   * y uses the same memory region as x). The way to work around this is to use a different
954
   * memory region for y = x + 1, but for the first iteration, have x point to the
955
   * original. During the allocation process, the way to identify whether x should preserve
956
   * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
957
   * parent tensor is the same as the memory region it plans to use in the buffer, then we are
958
   * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
959
   * it is the input tensor whenever that is possible. A tensor block can point to two parent
960
   * tensors, one being the input tensor, one the output tensor. p_refs[0] should be the input
961
   * tensor whenever that is possible. */
962
26
  if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
963
15
    return 0;
964
  // Otherwise, return 1 because we now need to preserve.
965
11
  return 1;
966
26
}
967
968
static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
969
58
{
970
58
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
971
  // If it is unassigned, no need to preserve.
972
58
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
973
0
    return 0;
974
  // Only tape vars need to force broadcast; otherwise we already share the same memory region.
975
58
  if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
976
54
    return 0;
977
4
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
978
  // If p is not output, no need to broadcast at all.
979
4
  if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
980
3
    return 0;
981
1
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
982
1
  assert(vt_ref >= 0);
983
1
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
984
1
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
985
  // If the buffer is a truly read-only one, no need to broadcast.
986
1
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
987
0
    return 0;
988
  // Otherwise, return 1 because we now need to force broadcast for this tape var.
989
1
  return 1;
990
1
}
991
992
static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
993
25
{
994
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
995
25
  int i;
996
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
997
53
    if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
998
8
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
999
45
    else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1000
7
      _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
1001
25
}
1002
1003
static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1004
25
{
1005
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1006
25
  int i;
1007
25
  if (mv->sp)
1008
8
    for (i = 0; i < mv->sp->rnum; i++)
1009
6
    {
1010
6
      ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
1011
6
      if (CCV_NNC_IS_METADATA_POS(*tensor))
1012
1
      {
1013
1
        const int pos = (int)(intptr_t)*tensor;
1014
1
        *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1015
1
        assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
1016
1
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1017
1
      }
1018
6
    }
1019
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
1020
53
  {
1021
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1022
8
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1023
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
1024
0
      CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
1025
53
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1026
7
      _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1027
53
  }
1028
25
}
1029
1030
static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1031
47
{
1032
  // Go to the root of the graph.
1033
47
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1034
47
  int i;
1035
104
  for (i = 1; prep->p; 
i++57
)
1036
57
    prep = prep->p;
1037
  // Root graph should have no dup tensor blocks.
1038
47
  assert(!prep->dup_tensor_block_ref);
1039
47
  const int c = i;
1040
47
  const ccv_nnc_symbolic_graph_prep_t* preps[c];
1041
47
  prep = graph_prep;
1042
47
  preps[c - 1] = prep;
1043
104
  for (i = 0; prep->p; 
i++57
)
1044
57
    preps[c - 2 - i] = prep = prep->p;
1045
47
  int ch[c]; // Variable-length array. This is an array to record our selections when recursing from top to bottom.
1046
47
  memset(ch, 0, sizeof(int) * c);
1047
47
  int pos = 0;
1048
47
  _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1049
47
  assert(ch[c - 1] == 0); // This should never be modified.
1050
47
  assert(pos > 0);
1051
47
  return pos;
1052
47
}
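The generator above first walks prep->p to the root to learn the nesting depth, then fills a preps[] array ordered from the root down to the current prep. A standalone sketch of that chain-collection pattern, with hypothetical toy_node_t and collect_chain names:

#include <stdio.h>

typedef struct toy_node_s {
  const char* name;
  struct toy_node_s* p; /* parent, 0 at the root */
} toy_node_t;

/* Returns the depth (nodes from root to node, inclusive) and fills
 * out[0..depth-1] root-first, mirroring the preps[] construction above. */
static int collect_chain(toy_node_t* const node, toy_node_t** const out, const int max)
{
  int depth = 1;
  toy_node_t* it = node;
  for (; it->p; it = it->p)
    ++depth;
  if (depth > max)
    return -1; /* not enough room */
  int i;
  out[depth - 1] = node;
  for (i = 0, it = node; it->p; i++)
    out[depth - 2 - i] = it = it->p;
  return depth;
}

int main(void)
{
  toy_node_t root = { "root", 0 };
  toy_node_t mid = { "mid", &root };
  toy_node_t leaf = { "leaf", &mid };
  toy_node_t* chain[8];
  const int depth = collect_chain(&leaf, chain, 8);
  int i;
  for (i = 0; i < depth; i++)
    printf("%d: %s\n", i, chain[i]->name);
  return 0;
}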
1053
1054
static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1055
3
{
1056
3
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1057
3
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1058
3
  ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1059
3
  ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1060
3
    CCV_NNC_TENSOR_PLACEHOLDER,
1061
3
    tv,
1062
3
  }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1063
3
  CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
1064
3
  CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
1065
3
  return mv_pos;
1066
3
}
1067
1068
static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1069
30
{
1070
30
  ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1071
30
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr);
1072
30
  if (!is_multiview)
1073
18
    return pos;
1074
24
  
  while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr))
1075
12
  {
1076
12
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1077
12
    tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1078
12
  }
1079
12
  const ccv_nnc_tensor_t tensor = *tensor_ptr;
1080
12
  const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1081
12
  ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1082
12
  *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1083
12
  new_tensor->dataof = tensor.dataof;
1084
12
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1085
12
  new_tensor->alias_ref = (uintptr_t)pos;
1086
12
  ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1087
12
  return new_pos;
1088
30
}
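The flattening step above drills through nested multi-views (always taking slot 0) until it reaches a concrete leaf whose data it can copy into a flat tensor. A minimal standalone sketch of the drill-down, using hypothetical toy types that only model the nesting, not the metadata position encoding of the real code:

#include <stdio.h>

typedef struct toy_view_s {
  int is_multiview;
  struct toy_view_s* data[4]; /* sub-views when is_multiview */
  float* ptr;                 /* payload when a leaf */
} toy_view_t;

/* Follow slot 0 until we land on a leaf, like the while loop above. */
static toy_view_t* toy_leaf_of(toy_view_t* view)
{
  while (view->is_multiview)
    view = view->data[0];
  return view;
}

int main(void)
{
  float payload[4] = { 1, 2, 3, 4 };
  toy_view_t leaf = { 0, { 0 }, payload };
  toy_view_t inner = { 1, { &leaf }, 0 };
  toy_view_t outer = { 1, { &inner }, 0 };
  printf("leaf payload[0] = %g\n", toy_leaf_of(&outer)->ptr[0]);
  return 0;
}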
1089
1090
static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1091
2.69k
{
1092
2.69k
  const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1093
  // What it references is not an alias itself.
1094
2.69k
  assert(vt_tensors[alias_ref]);
1095
2.69k
  const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1096
2.69k
  const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1097
2.69k
  assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1098
  // Will use that to determine whether to insert a reference or not.
1099
2.69k
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1100
2.70k
  while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1101
13
  {
1102
13
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1103
13
    alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1104
13
  }
1105
2.69k
  const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1106
  // If there is no ofs, and the stride is packed to match dim, we take a shortcut and just init as a normal tensor.
1107
2.69k
  int pos;
1108
2.69k
  if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1109
2.69k
    
ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim)2.66k
)
1110
2.63k
  {
1111
2.63k
    pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1112
2.63k
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1113
2.63k
    *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1114
2.63k
    tensor->dataof = alias_tensor.dataof;
1115
2.63k
  } else {
1116
59
    pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1117
59
    ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1118
    // Otherwise initialize a tensor view
1119
59
    *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1120
59
    tensor_view->alias_ref = (uintptr_t)alias_pos;
1121
59
  }
1122
2.69k
  vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1123
2.69k
  if (is_multiview)
1124
13
  {
1125
13
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1126
13
    ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1127
13
  }
1128
2.69k
}
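The shortcut above boils down to: an alias with all-zero offsets and a fully packed (contiguous) stride can be described by a plain tensor header, while anything else needs a strided view. A standalone sketch of that decision, assuming a row-major packing convention; the toy_* helper names are hypothetical:

#include <stdio.h>

/* A stride is "packed" when stride[d] equals the product of all later dims. */
static int toy_stride_is_packed(const int* const dim, const int* const stride, const int nd)
{
  int expected = 1, d;
  for (d = nd - 1; d >= 0; d--)
  {
    if (stride[d] != expected)
      return 0;
    expected *= dim[d];
  }
  return 1;
}

static int toy_alias_needs_view(const int* const ofs, const int* const dim, const int* const stride, const int nd)
{
  int d;
  for (d = 0; d < nd; d++)
    if (ofs[d] != 0)
      return 1; /* any offset forces a view */
  return !toy_stride_is_packed(dim, stride, nd);
}

int main(void)
{
  const int dim[3] = { 2, 3, 4 };
  const int packed[3] = { 12, 4, 1 };
  const int strided[3] = { 24, 8, 1 }; /* e.g. a slice of a wider tensor */
  const int no_ofs[3] = { 0, 0, 0 };
  const int some_ofs[3] = { 0, 1, 0 };
  printf("packed, no ofs   -> view? %d\n", toy_alias_needs_view(no_ofs, dim, packed, 3));
  printf("packed, with ofs -> view? %d\n", toy_alias_needs_view(some_ofs, dim, packed, 3));
  printf("strided, no ofs  -> view? %d\n", toy_alias_needs_view(no_ofs, dim, strided, 3));
  return 0;
}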
1129
1130
static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1131
2.69k
{
1132
  // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1133
2.69k
  if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]) && 
tensor_blocks[block_ref].alias_ref3
&&
!vt_tensors[block_ref]3
)
1134
3
  {
1135
3
    const int ref = tensor_blocks[block_ref].alias_ref - 1;
1136
3
    if (!vt_tensors[ref])
1137
0
      _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1138
3
    vt_tensors[block_ref] = vt_tensors[ref];
1139
3
    return;
1140
3
  }
1141
2.69k
  assert
(tensor_symbol_info[block_ref].alias_ref)2.69k
;
1142
2.69k
  const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1143
  // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1144
2.69k
  if (!vt_tensors[alias_ref])
1145
3
    _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1146
2.69k
  _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1147
2.69k
}
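The recursion above resolves chains of references on demand: if the entry an alias points at has not been materialized yet, resolve that one first, then memoize the result. A standalone sketch of the same pattern over a plain array of 1-based references; toy_resolve and the sample data are hypothetical:

#include <stdio.h>

#define N 5

/* Resolve entry i: follow its reference (1-based, 0 means concrete) and
 * memoize the result, recursing first when the target isn't resolved yet. */
static void toy_resolve(const int* const ref, const char** const names, const char** const resolved, const int i)
{
  if (resolved[i])
    return; /* already materialized */
  if (ref[i] == 0)
  {
    resolved[i] = names[i]; /* concrete root */
    return;
  }
  const int target = ref[i] - 1; /* references start with 1, like alias_ref / ref above */
  if (!resolved[target])
    toy_resolve(ref, names, resolved, target);
  resolved[i] = resolved[target];
}

int main(void)
{
  const char* names[N] = { "A", "B", "C", "D", "E" };
  const int ref[N] = { 0, 1, 2, 0, 4 }; /* B -> A, C -> B, E -> D */
  const char* resolved[N] = { 0 };
  int i;
  for (i = 0; i < N; i++)
  {
    toy_resolve(ref, names, resolved, i);
    printf("%s resolves to %s\n", names[i], resolved[i]);
  }
  return 0;
}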
1148
1149
// Turn a linear pointer into an object storage (such as MTLBuffer).
1150
#ifdef HAVE_MPS
1151
static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
1152
{
1153
  mpobjfree(0, ptr);
1154
}
1155
#endif
1156
1157
typedef struct {
1158
  size_t size;
1159
  void* obj;
1160
} tensor_arena_obj_track_t;
1161
1162
typedef struct {
1163
  void* ptr;
1164
  off_t offset;
1165
  size_t size;
1166
} obj_ptr_key_t;
1167
1168
static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1169
0
{
1170
0
  return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1171
0
}
1172
1173
static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1174
0
{
1175
0
  return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1176
0
}
1177
1178
KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)
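The hash and equality pair above lets khash deduplicate wrapper objects keyed by the composite (ptr, offset, size). As a dependency-free sketch of the same idea (not the khash API), here is a tiny open-addressing map using the same key shape; every toy_* name is hypothetical:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>

typedef struct {
  void* ptr;
  off_t offset;
  size_t size;
} toy_key_t;

typedef struct {
  int used;
  toy_key_t key;
  void* value;
} toy_slot_t;

#define TOY_CAP 64 /* power of two, kept small for the sketch */

static uint32_t toy_hash(const toy_key_t key)
{
  return (uint32_t)(((uint64_t)(uintptr_t)key.ptr >> 4) + (uint64_t)key.offset + key.size);
}

static int toy_equal(const toy_key_t a, const toy_key_t b)
{
  return a.ptr == b.ptr && a.offset == b.offset && a.size == b.size;
}

/* Return the existing value for the key, or insert and return fresh_value. */
static void* toy_get_or_put(toy_slot_t* const table, const toy_key_t key, void* const fresh_value)
{
  uint32_t i = toy_hash(key) & (TOY_CAP - 1);
  for (;; i = (i + 1) & (TOY_CAP - 1))
  {
    if (!table[i].used)
    {
      table[i].used = 1;
      table[i].key = key;
      table[i].value = fresh_value;
      return fresh_value;
    }
    if (toy_equal(table[i].key, key))
      return table[i].value;
  }
}

int main(void)
{
  static toy_slot_t table[TOY_CAP];
  char buffer[256];
  int a = 1, b = 2;
  const toy_key_t k0 = { buffer, 0, 64 };
  const toy_key_t k1 = { buffer, 64, 64 };
  printf("first insert returns a: %d\n", toy_get_or_put(table, k0, &a) == &a);
  printf("same key returns a:     %d\n", toy_get_or_put(table, k0, &b) == &a);
  printf("new offset returns b:   %d\n", toy_get_or_put(table, k1, &b) == &b);
  return 0;
}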
1179
1180
static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1181
27.2k
{
1182
27.2k
  if (params.dim[0] == 0)
1183
0
    return 0;
1184
#ifdef HAVE_MPS
1185
  if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
1186
  {
1187
    int ret;
1188
    const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype) * ccv_nnc_tensor_count(params);
1189
    const obj_ptr_key_t key = {
1190
      .ptr = ptr,
1191
      .offset = offset,
1192
      .size = size,
1193
    };
1194
    khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret);
1195
    if (ret != 0)
1196
    {
1197
      void* obj = mpobjcreate(ptr, offset, size);
1198
      if (!tensor_arena->disposers)
1199
        tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1200
      ccv_nnc_arena_disposer_t disposer = {
1201
        .ptr = obj,
1202
        .userdata = 0,
1203
        .dispose = _ccv_nnc_tensor_arena_obj_dispose
1204
      };
1205
      ccv_array_push(tensor_arena->disposers, &disposer);
1206
      kh_val(obj_ptr_map, k) = obj;
1207
      return obj;
1208
    } else
1209
      return kh_val(obj_ptr_map, k);
1210
  }
1211
#endif
1212
27.2k
  return ptr + offset;
1213
27.2k
}
1214
1215
static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1216
6.22k
{
1217
  // All tensors are assigned out now; num_assigned is the number of discontinuous buffers.
1218
  // Each tensor has its designation in the assigned array, and its offset in allocated_offset.
1219
6.22k
  const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1220
6.22k
  ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1221
6.22k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1222
6.22k
  const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1223
6.22k
  const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1224
6.22k
  const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1225
6.22k
  const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1226
6.22k
  const int unroll_count = graph_prep->unroll_count;
1227
6.22k
  int i, j;
1228
97.5k
  for (i = 0; i < tensor_symbol_info_size; 
i++91.2k
)
1229
91.2k
    
for (j = 0; 91.2k
TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) &&
j < unroll_count61.3k
;
j++7
)
1230
7
    {
1231
7
      const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1232
7
      if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref]))
1233
3
        TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i]);
1234
7
    }
1235
6.22k
  ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1236
6.22k
  graph_prep->tensor_arena = tensor_arena;
1237
6.22k
  tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1238
6.22k
  tensor_arena->buffers = (void*)(tensor_arena + 1);
1239
6.22k
  tensor_arena->buffer_size = alloc_prep->buffer_size;
1240
6.22k
  tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1241
6.22k
  tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1242
6.22k
  tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1243
6.22k
  tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1244
6.22k
  tensor_arena->pb_vt_tensors = 0;
1245
6.22k
  tensor_arena->vt_alias_r_refs_p = 0;
1246
6.22k
  tensor_arena->vt_alias_r_refs = 0;
1247
6.22k
  tensor_arena->vt_sizes = 0;
1248
6.22k
  tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1249
6.22k
  tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1250
6.22k
  tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1251
6.22k
  tensor_arena->allocator.context.free = allocator.context.free;
1252
6.22k
  tensor_arena->allocator.isa = allocator.isa;
1253
6.22k
  tensor_arena->disposers = 0;
1254
  // Copy alias_ref info back to the tensor arena.
1255
97.5k
  for (i = 0; i < tensor_symbol_info_size; 
i++91.2k
)
1256
91.2k
    tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1257
  // Do the buffer copies.
1258
22.4k
  for (i = 0; i < alloc_prep->buffer_size; 
i++16.2k
)
1259
16.2k
    tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1260
16.2k
      tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1261
16.2k
      tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1262
6.22k
  if (graph_prep->while_count_tensor)
1263
19
  {
1264
    // If we need to have a while count tensor, allocate that first, and set its pointer to point to the while_count variable.
1265
19
    int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1266
19
    assert((0 << 1) + 1 == pos); // pos must be 0 position.
1267
19
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1268
19
    *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1269
19
  }
1270
6.22k
  assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
1271
6.22k
  if (p_arena && 
p_graph_prep49
)
1272
49
  {
1273
    // Don't need to allocate the actual buffer, just use the pointer from the above.
1274
49
    PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
1275
229
    for (i = 0; i < tensor_arena->buffer_size; 
i++180
)
1276
180
    {
1277
180
      const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1278
180
      int unref_p_ref = p_ref;
1279
182
      while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1280
2
        unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1281
180
      assert(unref_p_ref >= 0);
1282
180
      const int p_unroll_count = p_graph_prep->unroll_count;
1283
180
      if (p_graph_prep->dup_tensor_block_ref &&
1284
180
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 016
&&
1285
180
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref16
)
1286
10
      {
1287
        // This condition means that in the parent graph, we point to multiple tensor blocks for the same
1288
        // buffer; therefore, we cannot have one single pointer assigned in this case.
1289
        // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
1290
10
        tensor_arena->buffers[i].ptr = 0;
1291
10
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1292
10
        continue;
1293
10
      }
1294
      // Otherwise, find the actual buffer pointer.
1295
170
      const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1296
170
      assert(vt_ref >= 0);
1297
170
      const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1298
170
      if (!p_arena->buffers[buffer_ref].ptr)
1299
0
      {
1300
        // Pass it down as 0 ptr.
1301
0
        tensor_arena->buffers[i].ptr = 0;
1302
0
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1303
0
        continue;
1304
0
      }
1305
170
      const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1306
170
      tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1307
170
      PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
1308
170
    }
1309
6.17k
  } else {
1310
    // Now, allocate actual buffers.
1311
6.17k
    PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
1312
22.2k
    for (i = 0; i < tensor_arena->buffer_size; 
i++16.0k
)
1313
16.0k
    {
1314
16.0k
      const int buffer_type = tensor_arena->buffers[i].type;
1315
16.0k
      const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
1316
16.0k
#ifdef HAVE_CUDA
1317
16.0k
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1318
2.35k
      {
1319
2.35k
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1320
2.35k
        if (allocator.isa && 
allocator.isa->alloc266
)
1321
266
          tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1322
2.09k
        else
1323
2.09k
          tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1324
2.35k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1325
13.7k
      } else {
1326
13.7k
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1327
13.7k
        if (tensor_arena->buffers[i].pin_mem)
1328
17
          tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1329
13.7k
        else
1330
13.7k
          ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1331
13.7k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1332
13.7k
      }
1333
#elif defined(HAVE_MPS)
1334
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1335
      {
1336
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1337
        // if (allocator.isa && allocator.isa->alloc)
1338
        //  tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1339
        // else
1340
        tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1341
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1342
      } else {
1343
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1344
        ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1345
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1346
      }
1347
#else
1348
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1349
      ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1350
      PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1351
#endif
1352
16.0k
      assert(tensor_arena->buffers[i].ptr);
1353
16.0k
    }
1354
6.17k
  }
1355
  // Go over sub_preps and allocate arenas for them. Do this early because
1356
  // we may reference tensors from the sub arenas: for output tensors, the
1357
  // sub arena's tensor will have automatic reference updates, which is why
1358
  // we need to reference them here.
1359
6.27k
  
for (i = 0; 6.22k
i < tensor_arena->sub_arena_size;
i++50
)
1360
50
    if (graph_prep->sub_preps[i])
1361
49
      tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1362
1
    else
1363
1
      tensor_arena->sub_arenas[i] = 0;
1364
6.22k
  memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1365
  // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1366
6.22k
  ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1367
#ifdef HAVE_MPS
1368
  khash_t(obj_ptr)* obj_ptr_map = kh_init(obj_ptr);
1369
#else
1370
6.22k
  khash_t(obj_ptr)* obj_ptr_map = 0;
1371
6.22k
#endif
1372
6.27k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1373
50
    if (tensor_arena->sub_arenas[i])
1374
49
    {
1375
49
      assert(graph_prep->sub_preps[i]);
1376
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1377
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1378
49
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1379
45
        
for (j = 0; 21
j < node->output_size;
j++24
)
1380
24
        {
1381
24
          const int idx = node->outputs[j];
1382
24
          const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1383
24
          assert(s_idx >= 0);
1384
24
          ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1385
24
          assert(sub_arena_out_tensors[idx] == 0);
1386
24
          ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1387
          // Only assign if it is a multiview tensor.
1388
24
          if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) ||
1389
24
            (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)))
1390
17
            sub_arena_out_tensors[idx] = sub_tensor;
1391
24
        }
1392
49
    }
1393
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
1394
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1395
91.2k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1396
27.2k
    {
1397
27.2k
      const int vt_ref = alloc_prep->vt_blocks[i];
1398
27.2k
      const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1399
      // Either we have dup_tensor_block_ref in the current layer, or we have it in a
1400
      // previous layer; therefore, we cannot really find the buffer ptr.
1401
27.2k
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[i]101
) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1402
27.2k
        
(27.2k
(27.2k
graph_prep->dup_tensor_block_ref27.2k
&&
1403
27.2k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] >= 059
&&
1404
27.2k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] != i57
) ||
1405
27.2k
         
(27.2k
buffer_ref >= 027.2k
&&
!tensor_arena->buffers[buffer_ref].ptr27.2k
)))
1406
47
      {
1407
47
        assert(graph_prep->p); // This must be in a sub-graph.
1408
        // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1409
47
        if (graph_prep->tensor_blocks[i].p_refs[0] && 
_ccv_nnc_tensor_block_check_preserve(graph_prep, i)36
)
1410
4
          continue;
1411
43
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1412
43
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1413
43
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1414
27.2k
      } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])) {
1415
        // When we want to allocate, we don't really need to if it needs force broadcast, because we will handle that later.
1416
27.2k
        const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1417
        // If already created, use the same tensor, and continue.
1418
        // Having ptr.
1419
27.2k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1420
27.2k
        ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1421
        // Also, set its allocations.
1422
        // Since tensor view is bit compatible with tensor, we can just cast.
1423
27.2k
        void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1424
27.2k
        *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1425
27.2k
        assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size);
1426
        // If we need to force broadcast, we need to wrap it in a multiview.
1427
27.2k
        if (graph_prep->tensor_blocks[i].p_refs[0] &&
1428
27.2k
          
_ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i)58
)
1429
1
        {
1430
1
          const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1431
1
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1432
1
          ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1433
1
          ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1434
1
            tv,
1435
1
          }, 0, 1, graph_prep->graph, mv);
1436
1
          CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1437
1
          pos = mv_pos;
1438
1
          ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1439
1
        }
1440
27.2k
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1441
27.2k
      }
1442
27.2k
    }
1443
#ifdef HAVE_MPS
1444
  kh_destroy(obj_ptr, obj_ptr_map);
1445
#endif
1446
  // Handle bound tensors. First handle cases without aliases.
1447
53.8k
  
for (i = 0; 6.22k
i < tensor_bind_size;
i++47.5k
)
1448
47.5k
  {
1449
47.5k
    assert(tensor_binds[i].tensor);
1450
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1451
47.5k
    if (resolved_symbol.d >= 0)
1452
47.5k
    {
1453
47.5k
      int d = resolved_symbol.d;
1454
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1455
1.02k
        continue;
1456
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1457
      // It has nothing to do with aliases.
1458
46.7k
      
while (46.5k
TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1459
146
        d = tensor_blocks[d].ref - 1;
1460
      // For bound tensors, it shouldn't be assigned yet.
1462
      // If it is assigned, the pointer should match the one from the bound tensor.
1463
      // This can only happen if an enforced in-place tensor is bound twice. If that
1464
      // happens, we need to make sure it is bound to the same location.
1464
46.5k
      assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1465
      // See above assertion.
1466
46.5k
      if (tensor_arena->vt_tensors[d])
1467
0
        continue;
1468
46.5k
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1469
0
      {
1470
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1471
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1472
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1473
0
        if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1474
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1475
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1476
        // It is OK for it to be, as a whole, smaller than or equal to the bound one.
1477
0
        assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1478
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1479
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1480
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1481
46.5k
      } else {
1482
46.5k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1483
46.5k
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1484
46.5k
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1485
46.5k
        tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1486
46.5k
        tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1487
46.5k
        tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1488
46.5k
        tv->dataof = tensor_binds[i].tensor->dataof;
1489
46.5k
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1490
46.5k
      }
1491
46.5k
    }
1492
47.5k
  }
1493
  // Handle bound tensors. We handle aliases here so they can reference bound tensors.
1494
53.8k
  
for (i = 0; 6.22k
i < tensor_bind_size;
i++47.5k
)
1495
47.5k
  {
1496
47.5k
    assert(tensor_binds[i].tensor);
1497
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1498
47.5k
    if (resolved_symbol.d >= 0)
1499
47.5k
    {
1500
47.5k
      int d = resolved_symbol.d;
1501
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1502
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1503
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1504
      // It has nothing to do with aliases.
1505
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1506
146
        d = tensor_blocks[d].ref - 1;
1507
47.5k
      if (tensor_arena->vt_tensors[d])
1508
47.5k
        continue;
1509
      // Assert original alias has no ofs. Otherwise our binding will be problematic.
1510
26
      
for (j = 0; 2
j < CCV_NNC_MAX_DIM_ALLOC;
j++24
)
1511
24
        { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0); }
1512
2
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1513
0
      {
1514
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1515
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1516
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1517
0
        if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1518
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1519
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1520
        // It is OK for it to be, as a whole, smaller than or equal to the bound one.
1521
0
        assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1522
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1523
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1524
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1525
2
      } else {
1526
2
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1527
2
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1528
2
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1529
2
        tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1530
2
        tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1531
2
        tv->data = tensor_binds[i].tensor->data;
1532
2
        tv->dataof = tensor_binds[i].tensor->dataof;
1533
2
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1534
2
      }
1535
2
    }
1536
47.5k
  }
1537
  // Assign out refs. Refs are the simple ones, so we handle them first (they point to exactly the same metadata and the same region).
1538
  // Avoid refs that are actually aliases.
1539
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1540
    // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1541
91.2k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_blocks[i].ref61.3k
&&
!tensor_arena->vt_tensors[i]6.37k
&&
!tensor_blocks[i].alias_ref6.37k
)
1542
6.20k
    {
1543
6.20k
      int ref = tensor_blocks[i].ref - 1;
1544
6.20k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && 
tensor_blocks[ref].ref149
)
1545
1
        ref = tensor_blocks[ref].ref - 1;
1546
6.20k
      assert(tensor_arena->vt_tensors[ref]);
1547
6.20k
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1548
6.20k
    }
1549
  // Now after refs assigned out, handle the case I need to preserve because I am a sub graph of while loop.
1550
6.22k
  if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1551
21
  {
1552
21
    assert(graph_prep->p);
1553
21
    const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1554
21
    const int p_idx = graph_prep->p_idx - 1;
1555
46
    for (i = 0; i < node->input_size; 
i++25
)
1556
25
    {
1557
25
      const int idx = node->inputs[i];
1558
25
      int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1559
25
      assert(!tensor_blocks[block_ref].ref);
1560
25
      const int vt_ref = alloc_prep->vt_blocks[block_ref];
1561
25
      if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1562
18
        continue;
1563
25
      assert
(vt_ref >= 0)7
;
1564
7
      const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1565
7
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1566
7
      assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]));
1567
      // Either we have dup_tensor_block_ref in the current layer, or we have it in a
1568
      // previous layer; therefore, we cannot really find the buffer ptr.
1569
7
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[block_ref]0
) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1570
7
        ((graph_prep->dup_tensor_block_ref &&
1571
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 04
&&
1572
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref4
) ||
1573
7
         
!tensor_arena->buffers[buffer_ref].ptr3
))
1574
4
      {
1575
        // We haven't allocated anything for this yet.
1576
4
        assert(tensor_arena->vt_tensors[block_ref] == 0);
1577
4
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1578
4
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1579
4
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1580
4
      } else {
1581
3
        const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1582
3
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1583
3
        ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1584
3
      }
1585
7
    }
1586
21
  }
1587
  // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1588
  // This creates the multi-view tensor to achieve that.
1589
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1590
91.2k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1591
10
    {
1592
10
      const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1593
      // Create phi multi-view.
1594
10
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1595
10
      const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1596
10
      const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1597
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1598
10
      ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1599
10
      ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1600
10
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1601
10
        intv,
1602
10
        outv,
1603
10
      }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI, mv);
1604
10
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1605
10
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1606
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1607
10
      ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1608
10
    }
1609
  // Now it is time to handle alias.
1610
36.3k
  for (i = 0; i < alloc_prep->block_size; 
i++30.1k
)
1611
30.1k
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1612
29.9k
    {
1613
29.9k
      const int block_ref = alloc_prep->blocks[i].block_ref;
1614
29.9k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]) && 
!tensor_arena->vt_tensors[block_ref]2.69k
)
1615
2.69k
      {
1616
        // Assigning out the tensor aliases.
1617
2.69k
        assert(tensor_symbol_info[block_ref].alias_ref);
1618
2.69k
        _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1619
2.69k
      }
1620
29.9k
    }
1621
  // Now assigning out the rest of alias refs.
1622
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1623
    // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1624
91.2k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_blocks[i].alias_ref61.3k
&&
!tensor_arena->vt_tensors[i]167
)
1625
164
    {
1626
164
      int ref = tensor_blocks[i].alias_ref - 1;
1627
164
      assert(tensor_arena->vt_tensors[ref]);
1628
164
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1629
164
    }
1630
  // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1631
6.27k
  
for (i = 0; 6.22k
i < tensor_arena->sub_arena_size;
i++50
)
1632
50
    if (tensor_arena->sub_arenas[i])
1633
49
    {
1634
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1635
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1636
138
      for (j = 0; j < node->input_size; 
j++89
)
1637
89
      {
1638
89
        const int idx = node->inputs[j];
1639
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1640
89
        if (s_idx < 0)
1641
23
          continue;
1642
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1643
        // Only do the replacement if it is a multi-view tensor.
1644
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1645
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor) &&
!18
TENSOR_EXPECT_UNASSIGNED18
(tensor_blocks[idx]))
1646
18
        {
1647
          // It cannot be a bound tensor.
1648
18
          assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]));
1649
18
          const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1650
18
          const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1651
18
          ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? 
sub_arena_out_tensors[idx]1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)17
;
1652
          // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1653
          // to this tensor.
1654
18
          if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor))
1655
6
          {
1656
6
            const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1657
6
            ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1658
6
            ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? 
vt_tensor1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)5
);
1659
6
            ref_tensor->alias_ref = is_sub_arena_out_tensor ? 
(uintptr_t)vt_tensor1
:
(uintptr_t)vt_pos5
;
1660
6
            ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1661
6
            ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? 
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)5
CCV_NNC_MULTIVIEW_DATA5
(multiview)[0]) :
CCV_NNC_MULTIVIEW_DATA1
(multiview)[0]1
);
1662
6
            while (CCV_IS_TENSOR_MULTIVIEW(tv))
1663
0
              tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]);
1664
6
            *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1665
6
            ref_tensor->data = tv->data;
1666
6
            ref_tensor->dataof = tv->dataof;
1667
6
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1668
6
          } else
1669
12
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1670
18
        }
1671
66
      }
1672
49
    }
1673
  // After aliases are created, for a case..of statement we now revert back to a flat tensor rather than a multi-view.
1674
  // No worries though: this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1675
  // when initializing the case..of node that will take the phi multi-view again.
1676
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1677
91.2k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1678
10
    {
1679
10
      assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]));
1680
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1681
10
      assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI);
1682
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1683
10
    }
1684
  // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1685
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1686
91.2k
    if (tensor_arena->vt_tensors[i])
1687
82.9k
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1688
  // Associate multiview tensors from sub arena to the parent.
1689
6.22k
  if (sub_arena_out_tensors)
1690
29
  {
1691
240
    for (i = 0; i < alloc_prep->block_size; 
i++211
)
1692
211
      if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1693
111
      {
1694
111
        const int block_ref = alloc_prep->blocks[i].block_ref;
1695
111
        if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1696
0
          continue;
1697
111
        int sub_arena_ref = block_ref;
1698
111
        if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1699
10
        {
1700
          // Assigning out the tensor aliases.
1701
10
          assert(tensor_symbol_info[block_ref].alias_ref);
1702
10
          const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1703
          // What it references is not an alias itself.
1704
10
          assert(tensor_arena->vt_tensors[alias_ref]);
1705
10
          sub_arena_ref = alias_ref;
1706
10
          if (!sub_arena_out_tensors[sub_arena_ref])
1707
3
            continue;
1708
10
        }
1709
108
        if (!sub_arena_out_tensors[sub_arena_ref])
1710
84
          continue;
1711
24
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref]) ? 
sub_arena_out_tensors[sub_arena_ref]23
:
(ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref1
);
1712
24
        assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1713
        // This is only possible if the vt_tensors is a phi node.
1714
24
        if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1715
0
        {
1716
          // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1717
0
          ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1718
0
          assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI);
1719
0
          assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]));
1720
0
          CCV_NNC_MULTIVIEW_DATA(phi)[1]->alias_ref = (uintptr_t)mv;
1721
0
          ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)[1]);
1722
24
        } else {
1723
24
          tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1724
24
          ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1725
24
        }
1726
24
      }
1727
29
  }
1728
  // Go over all the tensors that have assign_ref. If the tensor it is assigned from is:
1729
  // 1). From sub_arena_out_tensors: it could now be pointing to an area this arena doesn't know about.
1730
  // 2). From a phi multi-view: in this case, this arena won't know beforehand which memory it is going to use.
1731
  // Therefore, for the above two scenarios, a tensor that has assign_ref, even if it is a multiview tensor, needs to subscribe
1732
  // to the output of the assign_ref tensor.
1733
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1734
91.2k
    if (tensor_arena->vt_tensors[i] && 
tensor_symbol_info[i].assign_ref82.9k
)
1735
25
    {
1736
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1737
25
      ccv_nnc_tensor_t* assign_tensor;
1738
25
      if (sub_arena_out_tensors && 
sub_arena_out_tensors[assign_ref]3
)
1739
0
        assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref]) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1740
25
      else
1741
25
        assign_tensor = tensor_arena->vt_tensors[assign_ref];
1742
25
      ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1743
25
    }
1744
  // After everything is handled, assert again to make sure the tensors and tensor binds point to the right location. This is really just for assertion.
1745
53.8k
  for (i = 0; i < tensor_bind_size; 
i++47.5k
)
1746
47.5k
  {
1747
47.5k
    assert(tensor_binds[i].tensor);
1748
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1749
47.5k
    if (resolved_symbol.d >= 0)
1750
47.5k
    {
1751
47.5k
      int d = resolved_symbol.d;
1752
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1753
      // It has nothing to do with aliases.
1754
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && 
tensor_blocks[d].ref46.7k
)
1755
146
        d = tensor_blocks[d].ref - 1;
1756
      // Note we don't trace back on alias. This is intentional.
1757
47.5k
      assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1758
47.5k
    }
1759
47.5k
  }
1760
6.22k
  if (sub_arena_out_tensors)
1761
29
    ccfree(sub_arena_out_tensors);
1762
  // Rewire sub arena's tensor references.
1763
6.27k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1764
50
    if (tensor_arena->sub_arenas[i])
1765
49
    {
1766
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1767
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1768
138
      for (j = 0; j < node->input_size; 
j++89
)
1769
89
      {
1770
89
        const int idx = node->inputs[j];
1771
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1772
89
        if (s_idx < 0)
1773
23
          continue;
1774
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1775
        // Only do the replacement if it is a multi-view tensor.
1776
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1777
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor))
1778
18
        {
1779
          // This is a bound tensor, bind it now.
1780
18
          if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1781
0
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1782
18
          else
1783
18
            _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1784
18
        }
1785
66
      }
1786
49
    }
1787
6.22k
  return tensor_arena;
1788
6.22k
}
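Throughout the arena construction above, slots that will eventually hold ccv_nnc_tensor_t* temporarily hold a small "metadata position" cast to a pointer, and a final rewire pass converts each position into a stable pointer once the metadata array can no longer move. The sketch below models that two-phase trick; the tag-in-the-low-bit encoding is an assumption for illustration (suggested by the `(0 << 1) + 1 == pos` assertion above), not a claim about the exact ccv encoding, and all toy_* names are hypothetical.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct {
  double payload;
} toy_item_t;

/* Encode array index i as a fake pointer with the low bit set. */
static toy_item_t* toy_pos(const int i)
{
  return (toy_item_t*)(intptr_t)((i << 1) | 1);
}

static int toy_is_pos(const toy_item_t* const p)
{
  return (int)((intptr_t)p & 1);
}

/* Turn a tagged position back into a real pointer; pass real pointers through. */
static toy_item_t* toy_rewire(toy_item_t* const storage, toy_item_t* const p)
{
  return toy_is_pos(p) ? storage + ((intptr_t)p >> 1) : p;
}

int main(void)
{
  /* Phase 1: record references as positions while the storage may still move. */
  toy_item_t* slots[3] = { toy_pos(0), toy_pos(2), toy_pos(1) };
  /* Phase 2: storage is final, rewire positions into real pointers. */
  toy_item_t* storage = (toy_item_t*)calloc(3, sizeof(toy_item_t));
  storage[0].payload = 1.0;
  storage[1].payload = 2.0;
  storage[2].payload = 3.0;
  int i;
  for (i = 0; i < 3; i++)
  {
    slots[i] = toy_rewire(storage, slots[i]);
    printf("slot %d -> %g\n", i, slots[i]->payload);
  }
  free(storage);
  return 0;
}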
1789
1790
static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1791
17
{
1792
17
  assert(graph);
1793
17
  if ((intptr_t)graph == tensor_arena->graph_ref)
1794
7
  {
1795
7
    assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size);
1796
7
    return tensor_arena->vt_tensors[pair_ref];
1797
7
  }
1798
10
  int i;
1799
13
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++3
)
1800
10
    if (tensor_arena->sub_arenas[i])
1801
10
    {
1802
10
      ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1803
10
      if (tensor)
1804
7
        return tensor;
1805
10
    }
1806
3
  return 0;
1807
10
}
1808
1809
static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1810
7
{
1811
7
  if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
1812
5
    tensor->type |= CCV_TAPE_ALLOC;
1813
2
  else {
1814
2
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1815
2
    mv->type |= CCV_TAPE_ALLOC;
1816
2
    int i;
1817
5
    for (i = 0; i < mv->repeat + mv->kind; 
i++3
)
1818
3
      _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1819
2
  }
1820
7
}
1821
1822
static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1823
6.22k
{
1824
6.22k
  assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
1825
6.22k
  int i;
1826
97.5k
  for (i = 0; i < graph_prep->tensor_symbol_info_size; 
i++91.2k
)
1827
91.2k
  {
1828
91.2k
    if (graph_prep->tensor_symbol_info[i].pair_ref)
1829
7
    {
1830
7
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1831
      // No need to continue checking this if it is from its pair.
1832
7
      continue;
1833
7
    }
1834
91.2k
    if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && 
tensor_arena->vt_tensors[i]7
)
1835
7
    {
1836
      // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1837
7
      if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i]))
1838
5
      {
1839
5
        const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1840
5
        if (vt_ref >= 0 &&
1841
5
          TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref]) == READ_ONLY)
1842
3
          continue;
1843
5
      }
1844
4
      _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1845
4
    }
1846
91.2k
  }
1847
6.27k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
1848
50
    if (graph_prep->sub_preps[i])
1849
49
      _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1850
6.22k
}
1851
1852
static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1853
128k
{
1854
128k
  int i, found = 0;
1855
  // Try to insert head.
1856
128k
  ccv_array_t* head = tensor_blocks.head;
1857
128k
  assert(head);
1858
130k
  
for (i = 0; 128k
i < head->rnum;)
1859
60.7k
  {
1860
60.7k
    const int head_idx = *(int*)ccv_array_get(head, i);
1861
60.7k
    if (head_idx == idx)
1862
118
    {
1863
118
      found = 1;
1864
118
      break;
1865
118
    }
1866
60.6k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1867
60.6k
    if (cell.i32 && 
cell.i32[0] > 041
)
1868
41
    {
1869
      /* If the current node is the parent of the head node, check if we found it or not. */
1870
      /* If not found, replace the current one. */
1871
41
      if (!found)
1872
41
      {
1873
41
        found = 1;
1874
41
        *(int*)ccv_array_get(head, i) = idx;
1875
41
      } else {
1876
        /* Remove the current one, change the rnum. */
1877
0
        if (i < head->rnum - 1)
1878
0
          *(int*)ccv_array_get(head, i) = *(int*)ccv_array_get(head, head->rnum - 1);
1879
0
        --head->rnum;
1880
0
        continue;
1881
0
      }
1882
60.6k
    } else {
1883
      // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1884
60.6k
      cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1885
60.6k
      if (cell.i32 && 
cell.i32[0] > 058.4k
)
1886
58.4k
      {
1887
58.4k
        found = 1;
1888
58.4k
        break;
1889
58.4k
      }
1890
60.6k
    }
1891
    /* Advancing i. */
1892
2.17k
    ++i;
1893
2.17k
  }
1894
  /* If not found, push this idx to the end of the array. */
1895
128k
  if (!found)
1896
70.1k
    ccv_array_push(head, &idx);
1897
  // Try to insert tail.
1898
128k
  found = 0;
1899
128k
  ccv_array_t* tail = tensor_blocks.tail;
1900
128k
  assert(tail);
1901
186k
  
for (i = 0; 128k
i < tail->rnum;)
1902
61.9k
  {
1903
61.9k
    const int tail_idx = *(int*)ccv_array_get(tail, i);
1904
61.9k
    if (tail_idx == idx)
1905
4.48k
    {
1906
4.48k
      found = 1;
1907
4.48k
      break;
1908
4.48k
    }
1909
57.4k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1910
57.4k
    if (cell.i32 && cell.i32[0] > 0)
1911
55.1k
    {
1912
      /* If the current node is the child of the tail node, check if we found it or not. */
1913
      /* If not found, replace the current one. */
1914
55.1k
      if (!found)
1915
53.9k
      {
1916
53.9k
        found = 1;
1917
53.9k
        *(int*)ccv_array_get(tail, i) = idx;
1918
53.9k
      } else {
1919
        /* Remove the current one, change the rnum. */
1920
1.13k
        *(int*)ccv_array_get(tail, i) = *(int*)ccv_array_get(tail, tail->rnum - 1);
1921
1.13k
        --tail->rnum;
1922
1.13k
        continue;
1923
1.13k
      }
1924
55.1k
    } else {
1925
      // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1926
2.37k
      cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1927
2.37k
      if (cell.i32 && cell.i32[0] > 0)
1928
110
      {
1929
110
        found = 1;
1930
110
        break;
1931
110
      }
1932
2.37k
    }
1933
    /* Advancing i. */
1934
56.2k
    ++i;
1935
56.2k
  }
1936
  /* If not found, push this idx to the end of the array. */
1937
128k
  if (!found)
1938
70.2k
    ccv_array_push(tail, &idx);
1939
128k
}
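The routine above keeps the head and tail lists of a tensor block minimal under the exec_dep partial order: a candidate exec replaces a recorded head it precedes, and is dropped when a recorded head already precedes it (and symmetrically for tails). Below is a standalone sketch of that replace-or-skip rule, using a plain dense reachability matrix and hypothetical names instead of ccv's sparse matrix; it is simplified in that the real routine also de-duplicates when several recorded entries become dominated.

#include <stdio.h>

#define N 4

/* dep[a][b] != 0 means "b runs strictly before a" (b is an ancestor of a). */
static const int dep[N][N] = {
  {0, 0, 0, 0},
  {1, 0, 0, 0}, /* 0 -> 1 */
  {1, 1, 0, 0}, /* 0 -> 1 -> 2 */
  {1, 1, 1, 0}, /* 0 -> 1 -> 2 -> 3 */
};

/* Insert idx into the head list, keeping only the earliest execs. */
static void add_head(int* head, int* rnum, const int idx)
{
  int i;
  for (i = 0; i < *rnum; i++)
  {
    if (head[i] == idx)
      return; /* Already recorded. */
    if (dep[head[i]][idx]) /* idx runs before the recorded head, replace it. */
    {
      head[i] = idx;
      return;
    }
    if (dep[idx][head[i]]) /* The recorded head already runs before idx, nothing to do. */
      return;
  }
  head[(*rnum)++] = idx; /* Unrelated to everything recorded so far, append. */
}

int main(void)
{
  int head[N], rnum = 0, i;
  add_head(head, &rnum, 2);
  add_head(head, &rnum, 1); /* 1 precedes 2, so it replaces it. */
  add_head(head, &rnum, 3); /* 1 precedes 3, so it is skipped. */
  for (i = 0; i < rnum; i++)
    printf("head[%d] = %d\n", i, head[i]); /* Prints a single entry: 1. */
  return 0;
}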
1940
1941
ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1942
7.00k
{
1943
7.00k
  if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1944
6.90k
  {
1945
6.90k
    assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size);
1946
6.90k
    ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1947
6.90k
    if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor))
1948
11
    {
1949
11
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1950
22
      while (CCV_IS_TENSOR_MULTIVIEW(mv))
1951
11
        mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1952
11
      return (ccv_nnc_tensor_t*)mv;
1953
11
    }
1954
6.89k
    return tensor;
1955
6.90k
  }
1956
100
  int i;
1957
123
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1958
99
    if (tensor_arena->sub_arenas[i])
1959
99
    {
1960
99
      ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1961
99
      if (tensor)
1962
76
        return tensor;
1963
99
    }
1964
24
  return 0;
1965
100
}
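ccv_nnc_tensor_from_symbol resolves a symbolic tensor to the concrete tensor the arena materialized for it, descending into sub-arenas and unwrapping multiview tensors to the currently active view. A minimal usage sketch follows; it assumes a tensor_arena produced elsewhere by ccv_nnc_symbolic_graph_compile and a CPU float32 tensor behind the symbol, and the helper name is made up for illustration.

#include "ccv_nnc.h"

/* Hypothetical helper: read back the first float of the value bound to `symbol`
 * after the compiled graph has run. Returns 0 if the symbol was not materialized
 * (for example, it belongs to a different graph), and assumes a CPU float32 tensor. */
static float read_first_value(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
{
  ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(tensor_arena, symbol);
  if (!tensor)
    return 0;
  return tensor->data.f32[0];
}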
1966
1967
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1968
66.6k
{
1969
66.6k
  if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1970
66.6k
  {
1971
66.6k
    assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size);
1972
66.6k
    return graph_exec_arena->graph_execs[symbol.d];
1973
66.6k
  }
1974
7
  int i;
1975
9
  for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1976
7
    if (graph_exec_arena->sub_arenas[i])
1977
7
    {
1978
7
      ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1979
7
      if (!CCV_NO_GRAPH_EXEC(exec))
1980
5
        return exec;
1981
7
    }
1982
2
  return (ccv_nnc_graph_exec_t){}; // 0.
1983
7
}
1984
1985
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1986
9
{
1987
9
  return graph_exec_arena->source;
1988
9
}
1989
1990
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1991
9
{
1992
9
  return graph_exec_arena->destination;
1993
9
}
1994
1995
// Check whether the head is the beginning of this block.
1996
static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1997
50
{
1998
50
  assert(tensor_block->head);
1999
50
  return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0) == head_node);
2000
50
}
2001
2002
// Check whether the tail is the end of this block.
2003
static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2004
39
{
2005
39
  assert(tensor_block->tail);
2006
39
  return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0) == tail_node);
2007
39
}
2008
2009
// Make two tensor blocks one. Return 1 if that happened.
2010
static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2011
6.69k
{
2012
  // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2013
6.69k
  if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]) &&
2014
6.69k
    (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1]) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2015
6.69k
    tensor_blocks[p_ref_0].tail->rnum == 1 &&
2016
6.69k
    tensor_blocks[p_ref_1].head->rnum == 1 &&
2017
6.69k
    tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2018
6.69k
    *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0))
2019
6.38k
  {
2020
    // If the two parent refs match (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2021
6.38k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]));
2022
6.38k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]));
2023
6.38k
    ccv_array_free(tensor_blocks[p_ref_0].tail);
2024
6.38k
    tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2025
6.38k
    if (tensor_blocks[p_ref_1].p_refs[0])
2026
14
    {
2027
14
      assert(tensor_blocks[p_ref_1].p_refs[1] == 0); // It simply cannot have more than one p_refs, otherwise we cannot merge.
2028
14
      if (!tensor_blocks[p_ref_0].p_refs[0])
2029
10
        tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2030
4
      else
2031
4
        tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2032
14
    }
2033
6.38k
    tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2034
6.38k
    TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]));
2035
6.38k
    ccv_array_free(tensor_blocks[p_ref_1].head);
2036
6.38k
    if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1]))
2037
16
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]);
2038
    // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2039
6.38k
    TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1]);
2040
6.38k
    tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2041
6.38k
    if (!tensor_blocks[p_ref_0].r_refs)
2042
6.20k
      tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2043
6.38k
    ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2044
6.38k
    tensor_blocks[p_ref_1].size = 0;
2045
6.38k
    tensor_blocks[p_ref_1].head = 0;
2046
6.38k
    tensor_blocks[p_ref_1].tail = 0;
2047
6.38k
    return 1;
2048
6.38k
  }
2049
312
  return 0;
2050
6.69k
}
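Folding makes the consumer block an alias of the producer block when the producer's single tail exec and the consumer's single head exec meet at the same node and both blocks have the same type: p_ref_1 is marked UNASSIGNED with ref pointing back at p_ref_0, and p_ref_0 absorbs p_ref_1's tail, p_refs, pin_mem and read/write flags. A toy sketch of the life-time concatenation follows (hypothetical struct, plain intervals instead of head/tail exec arrays).

#include <stdio.h>

typedef struct {
  int head; /* first exec that touches the block */
  int tail; /* last exec that touches the block */
  int ref;  /* 1-based index of the block this one was folded into, 0 if live */
} block_t;

/* Fold b into a if a's life-time ends exactly where b's begins. Returns 1 on success. */
static int try_fold(block_t* const blocks, const int a, const int b)
{
  if (blocks[a].tail != blocks[b].head)
    return 0;
  blocks[a].tail = blocks[b].tail; /* a now covers both life-times */
  blocks[b].ref = a + 1;           /* b aliases a from now on */
  return 1;
}

int main(void)
{
  block_t blocks[2] = {
    { .head = 3, .tail = 5 }, /* produced at exec 3, last read at exec 5 */
    { .head = 5, .tail = 9 }, /* produced at exec 5, last read at exec 9 */
  };
  if (try_fold(blocks, 0, 1))
    printf("folded: block 0 now spans [%d, %d], block 1 ref = %d\n",
      blocks[0].head, blocks[0].tail, blocks[1].ref);
  return 0;
}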
2051
2052
static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2053
6.23k
{
2054
6.23k
  int i, j, k;
2055
  // Generate exec dependencies (or, in other words, partial ordering of executions).
2056
6.23k
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2057
6.23k
  int* buf = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2058
6.23k
  int buf_size;
2059
6.23k
  if (p_node_info)
2060
62
    { assert(output_size == 0); }
2061
6.23k
#define for_block(x, val) \
2062
212k
  do { \
2063
212k
    if (((int32_t*)val)[0] > 0) \
2064
212k
    { \
2065
212k
      buf[buf_size * 2] = x; \
2066
212k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2067
212k
      ++buf_size; \
2068
212k
    } \
2069
212k
  } while (0)
2070
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
2071
32.1k
    buf_size = 0; /* save all its parent deps to this buffer */
2072
32.1k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2073
32.1k
    if (vector)
2074
212k
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
2075
32.1k
    if (!node->outgoings)
2076
6.90k
      continue;
2077
53.3k
    for (i = 0; i < node->outgoings->rnum; i++)
2078
28.0k
    {
2079
28.0k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
2080
28.0k
      const int32_t one = 1;
2081
28.0k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2082
      /* If not found, set it. If the current node is the destination node, there is no need
2083
       * to set itself as parent of subsequent nodes because of its terminal nature. */
2084
28.0k
      if (!cell.i32 || cell.i32[0] == 0)
2085
28.0k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2086
28.0k
      if (buf_size > 0)
2087
22.6k
      {
2088
22.6k
        ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2089
22.6k
        assert(vector);
2090
257k
        for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2091
234k
        {
2092
234k
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2093
          /* If not found, set */
2094
234k
          if (!cell.i32 || cell.i32[0] == 0)
2095
203k
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2096
30.8k
          else {
2097
            /* Otherwise, set to the longest one */
2098
30.8k
            int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
2099
30.8k
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2100
30.8k
          }
2101
234k
        }
2102
22.6k
      }
2103
28.0k
    }
2104
25.2k
  } ccv_nnc_graph_visit_endfor
2105
6.23k
#undef for_block
2106
6.23k
  ccfree(buf);
2107
  // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2108
6.23k
  const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2109
6.23k
  ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2110
  // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
2111
  // happens that I have to loop through all relevant nodes to find out if one is used or not.
2112
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2113
91.4k
    tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2114
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2115
123k
    for (i = 0; i < node->input_size; i++)
2116
90.8k
      if (node->inputs[i] >= 0)
2117
64.3k
      {
2118
64.3k
        tensor_blocks[node->inputs[i]].flags = 0;
2119
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2120
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2121
64.3k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
2122
64.3k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2123
18
          tensor_blocks[node->inputs[i]].pin_mem = 1;
2124
64.3k
      }
2125
82.5k
    for (i = 0; i < node->output_size; i++)
2126
50.3k
      if (node->outputs[i] >= 0)
2127
41.4k
      {
2128
41.4k
        tensor_blocks[node->outputs[i]].flags = 0;
2129
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2130
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2131
41.4k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
2132
41.4k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2133
16
          tensor_blocks[node->outputs[i]].pin_mem = 1;
2134
41.4k
      }
2135
32.1k
  } ccv_nnc_graph_visit_endfor
2136
6.23k
  if (p_node_info)
2137
62
  {
2138
62
    assert(p_tensor_symbol_info);
2139
    // Mark it as used if it is used in either input or output.
2140
165
    for (i = 0; i < p_node_info->input_size; i++)
2141
103
      if (p_node_info->inputs[i] >= 0)
2142
103
      {
2143
103
        const int d = p_node_info->inputs[i];
2144
103
      if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2145
92
        {
2146
92
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
2147
92
          if (dd >= 0) // If this exists in this sub-graph, great.
2148
80
            tensor_blocks[dd].flags = 0;
2149
92
        }
2150
103
      }
2151
132
    for (i = 0; i < p_node_info->output_size; i++)
2152
70
      if (p_node_info->outputs[i] >= 0)
2153
70
      {
2154
70
        const int d = p_node_info->outputs[i];
2155
70
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2156
70
        {
2157
70
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
2158
70
          if (dd >= 0) // If this exists in this sub-graph, great.
2159
70
            tensor_blocks[dd].flags = 0;
2160
70
        }
2161
70
      }
2162
62
  }
2163
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2164
91.4k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
2165
70.6k
    {
2166
      // Check no tensor info is auto now.
2167
70.6k
      assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
2168
      // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as a parameter;
2169
      // therefore, its own life-cycle almost certainly won't concatenate properly with the tensor to
2170
      // fold to).
2171
70.6k
      if (tensor_symbol_info[i].assign_ref)
2172
40
      {
2173
        // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2174
        // It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2175
        // it kept its own representation, which is not the case for output).
2176
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2177
40
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2178
        // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2179
40
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref]);
2180
        // It also cannot be folded as output (except i), because we need to keep its own representation.
2181
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref]);
2182
40
        assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0);
2183
40
        tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2184
63
        for (j = 0; j < unroll_count; j++)
2185
23
        {
2186
23
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
2187
23
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
2188
23
        }
2189
40
        if (tensor_blocks[assign_ref].bypass_ref)
2190
4
        {
2191
          // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2192
4
          tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2193
4
          const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2194
4
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref]);
2195
4
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref]);
2196
          // On the other hand, it can be folded into the except_ref for the bypass_ref.
2197
4
          tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2198
4
          if (dup_tensor_from_ref)
2199
2
          {
2200
2
            const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2201
2
            if (bypass_from_ref >= 0)
2202
2
            {
2203
2
              TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref]);
2204
2
              TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref]);
2205
2
              assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref);
2206
2
              for (j = 0; j < unroll_count - 1; j++)
2207
0
              {
2208
                // Mark every incarnation as unfold-able.
2209
0
                TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
2210
0
                TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
2211
0
              }
2212
2
            }
2213
2
          }
2214
4
        }
2215
40
      }
2216
70.6k
    }
2217
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2218
91.4k
  {
2219
    // If it has a pair reference, we don't need to allocate this tensor at all,
2220
    // set it to be unassigned.
2221
91.4k
    if (tensor_symbol_info[i].pair_ref)
2222
15
      TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i]);
2223
    // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use tape properly).
2224
91.4k
    else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2225
7
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2226
7
      TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2227
      // For this case, there is no exception.
2228
7
      tensor_blocks[i].unfoldable_except_ref = 0;
2229
91.3k
    } else if (tensor_symbol_info[i].p_ref) {
2230
119
      assert(p_node_info);
2231
119
      const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2232
      // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2233
119
      if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2234
        // TODO: This check can be lifted if we can fold in the parent graph.
2235
48
        if (-1 == p_ref_is_in_or_out)
2236
20
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2237
119
      if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2238
68
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2239
119
    }
2240
91.4k
  }
2241
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2242
91.4k
  {
2243
91.4k
    if (tensor_symbol_info[i].alias_ref)
2244
3.26k
    {
2245
3.26k
      const int ref = tensor_symbol_info[i].alias_ref - 1;
2246
      // If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2247
3.26k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
2248
1.04k
        tensor_blocks[ref].flags = 0;
2249
      // An alias cannot ref to another alias.
2250
3.26k
      assert(!tensor_symbol_info[ref].alias_ref);
2251
3.26k
      tensor_blocks[i].flags = ALIAS;
2252
3.26k
      tensor_blocks[i].ref = ref + 1; // Assign the ref.
2253
3.26k
      if (!tensor_blocks[ref].r_refs)
2254
3.22k
        tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2255
3.26k
      ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2256
3.26k
    }
2257
91.4k
  }
2258
  // Scan again and if the ref is not assigned, mark the alias not assigned.
2259
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2260
91.4k
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
2261
3.26k
    {
2262
3.26k
      const int ref = tensor_blocks[i].ref - 1;
2263
3.26k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]))
2264
539
      {
2265
        // Mark this as unassigned.
2266
539
        tensor_blocks[i].flags = UNASSIGNED;
2267
539
        tensor_blocks[i].ref = 0;
2268
539
      }
2269
3.26k
    }
2270
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2271
91.4k
  {
2272
    // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2273
91.4k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
2274
68.9k
    {
2275
68.9k
      tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2276
68.9k
      tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2277
      // Cache tensor size (align to 16 bytes).
2278
68.9k
      tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2279
68.9k
    }
2280
    // If there is a p_ref, add the one to the p_refs list.
2281
91.4k
    if (tensor_symbol_info[i].p_ref)
2282
128
      tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2283
91.4k
  }
2284
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2285
123k
    for (i = 0; i < node->input_size; i++)
2286
90.8k
    {
2287
90.8k
      int d = node->inputs[i];
2288
90.8k
      if (d < 0)
2289
26.5k
        continue;
2290
64.3k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2291
1.57k
        d = tensor_symbol_info[d].alias_ref - 1;
2292
64.3k
      tensor_blocks[d].flags |= READ_ONLY;
2293
64.3k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2294
15
        continue;
2295
64.3k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2296
      /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph
2297
       * from the very beginning of the graph life-cycle and ends here). */
2298
64.3k
      if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags))
2299
27.4k
      {
2300
87.2k
        for (j = 0; j < source_size; j++)
2301
59.7k
        {
2302
          // If the source is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2303
59.7k
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2304
59.7k
          if (cell.i32 && cell.i32[0] > 0)
2305
22.7k
            _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2306
59.7k
        }
2307
        /* If this is a read-only (based on SSA, if first encountered as read), and this is
2308
         * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2309
         * loop, however, in that case, you need to prevent read-only gets reused for the
2310
         * output tensor, which is not obvious how to implement correctly), and it is not
2311
         * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2312
         * of memory anyway (because on second loop, we want to read the same value out).
2313
         * Mark it to the end of the graph. */
2314
27.4k
        if (p_node_info && !tensor_symbol_info[d].assign_ref)
2315
210
          for (j = 0; j < destination_size; j++)
2316
105
          {
2317
            // If the destination is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2318
105
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2319
105
            if (cell.i32 && cell.i32[0] > 0)
2320
65
              _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2321
105
          }
2322
27.4k
      }
2323
64.3k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2324
64.3k
    }
2325
82.5k
    for (i = 0; i < node->output_size; i++)
2326
50.3k
    {
2327
50.3k
      int d = node->outputs[i];
2328
50.3k
      if (d < 0)
2329
8.91k
        continue;
2330
41.4k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2331
1.36k
        d = tensor_symbol_info[d].alias_ref - 1;
2332
41.4k
      tensor_blocks[d].flags |= WRITE_ONLY;
2333
41.4k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2334
0
        continue;
2335
41.4k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2336
41.4k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2337
41.4k
    }
2338
32.1k
  } ccv_nnc_graph_visit_endfor
2339
  // For any assign_ref, its life-time is kept until the end and wraps over.
2340
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2341
    // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2342
    // that "somewhere else" need to keep its life-time til the end.
2343
91.4k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) &&
2344
91.4k
      p_node_info && tensor_symbol_info[i].assign_ref)
2345
42
    {
2346
42
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2347
84
      for (j = 0; j < destination_size; j++)
2348
42
      {
2349
        // This logic is to be more conservative about which destination we add to.
2350
        // As of now, if we add everything, it is fine most likely. However, it may
2351
        // cause issues in the future to do so naively. Thus, instead, we only add
2352
        // the destination to it iff either the tensor is not used at all, or, the
2353
        // destination is on the same stream as of the tensor block some way.
2354
42
        int flag = !tensor_blocks[assign_ref].tail;
2355
83
        for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2356
41
        {
2357
41
          const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k);
2358
41
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2359
41
          flag = (cell.i32 && cell.i32[0] > 0);
2360
41
        }
2361
42
        if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2362
10
          _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2363
42
      }
2364
42
    }
2365
6.33k
  for (i = 0; i < output_size; i++)
2366
99
  {
2367
99
    assert(outputs[i].graph == symbolic_graph);
2368
99
    int d = outputs[i].d;
2369
99
    if (d < 0)
2370
0
      continue;
2371
99
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2372
0
      d = tensor_symbol_info[d].alias_ref - 1;
2373
99
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2374
0
      continue;
2375
99
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2376
361
    for (j = 0; j < destination_size; j++)
2377
262
    {
2378
262
      int flag = !tensor_blocks[d].tail;
2379
524
      for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2380
262
      {
2381
262
        const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
2382
262
        const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2383
262
        flag = (cell.i32 && cell.i32[0] > 0);
2384
262
      }
2385
262
      if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2386
32
        _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2387
262
    }
2388
99
  }
2389
  // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2390
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2391
32.1k
    int x, y;
2392
123k
    for (x = 0; x < node->input_size; x++)
2393
260k
      for (y = 0; y < node->output_size; y++)
2394
        /* Some operations enforce some tensors to be the same for inputs / outputs. */
2395
169k
        if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2396
180
        {
2397
          // If both unassigned, it is fine.
2398
180
          if (node->inputs[x] < 0 && node->outputs[y] < 0)
2399
0
            continue;
2400
180
          int ref = node->inputs[x];
2401
180
          assert(ref >= 0);
2402
180
          while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
2403
0
            ref = tensor_blocks[ref].ref - 1;
2404
180
          const int node_output_y = node->outputs[y];
2405
180
          assert(node_output_y >= 0);
2406
          // If both are not computable, it is fine, we don't need to enforce.
2407
180
          if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2408
180
            !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y]))
2409
0
            continue;
2410
          // Otherwise, enforce and error out if failed.
2411
180
          if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2412
0
            { assert(0 && "cannot enforce inplace for the two tensors"); }
2413
180
        }
2414
32.1k
  } ccv_nnc_graph_visit_endfor
2415
  // Ignore tensors that are already bound, whether they are used or not. Doing it here because
2416
  // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2417
  // that is not enforced in-place (because the tensor enforced in-place will be different than the
2418
  // binding one).
2419
53.8k
  for (i = 0; i < tensor_bind_size; i++)
2420
47.5k
  {
2421
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2422
    // If there is a tensor bound, then it is unassigned.
2423
47.5k
    if (resolved_symbol.d >= 0)
2424
47.5k
    {
2425
47.5k
      int d = resolved_symbol.d;
2426
      // I cannot assert too much at this moment.
2427
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2428
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2429
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
2430
      // It has nothing to do with alias.
2431
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
2432
146
        d = tensor_blocks[d].ref - 1;
2433
      // Doesn't work if this is a loop carrying variable.
2434
47.5k
      assert(!tensor_symbol_info[d].assign_ref);
2435
47.5k
      tensor_blocks[d].flags = UNASSIGNED;
2436
47.5k
      tensor_blocks[d].ref = 0; // No need to have ref as well.
2437
47.5k
    }
2438
47.5k
  }
2439
  // Maximize tensor reuse by collapsing tensors that allow in-place operations (and whose start / end tensors match).
2440
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2441
32.1k
    int x, y;
2442
123k
    for (x = 0; x < node->input_size; x++)
2443
90.8k
    {
2444
      /* If the input is not assigned, it can be referenced, find the referenced one */
2445
90.8k
      int ref = node->inputs[x];
2446
90.8k
      if (ref < 0)
2447
26.5k
        continue;
2448
64.3k
      const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2449
71.6k
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
2450
7.27k
        ref = tensor_blocks[ref].ref - 1;
2451
64.3k
      assert(tensor_blocks[ref].ref == 0);
2452
64.3k
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2453
64.3k
        tensor_blocks[ref].tail->rnum == 1)
2454
33.0k
      {
2455
86.7k
        for (y = 0; y < node->output_size; y++)
2456
          /* Only proceed if the input symbol is different from the output symbol, */
2457
          /* and the input symbol meets the output symbol exactly at the same spot. */
2458
53.6k
          if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2459
53.6k
            node->outputs[y] >= 0 &&
2460
53.6k
            ref != node->outputs[y] &&
2461
53.6k
            TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]]))
2462
6.52k
          {
2463
6.52k
            const int node_output_y = node->outputs[y];
2464
6.52k
            const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2465
            /* If the dimensions match perfectly, then we can assign y_symbol to x.
2466
             * If both of them are aliases, make sure their origins match in size too. */
2467
6.52k
            if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
2468
6.51k
            {
2469
6.51k
              _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2470
              // This refers to an alias itself, now mark it and will be processed later.
2471
6.51k
              if (ref != node->inputs[x])
2472
290
                tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2473
6.51k
            }
2474
6.52k
          }
2475
33.0k
      }
2476
64.3k
    }
2477
32.1k
  } ccv_nnc_graph_visit_endfor
2478
  // Specifically handle the bypass. This needs to be done after the first pass.
2479
  // I need to extend the bypass life-time to the same as the one I am going with.
2480
  // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2481
6.23k
  ccv_nnc_tensor_block_t empty_block = {};
2482
6.23k
  empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2483
6.23k
  empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2484
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2485
32.1k
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2486
13
    {
2487
13
      int can_bypass = 1;
2488
28
      for (i = 0; can_bypass && i < node->output_size; i++)
2489
15
      {
2490
15
        int d = node->outputs[i];
2491
15
        if (d < 0)
2492
0
          continue;
2493
15
        if (!tensor_blocks[d].bypass_ref)
2494
2
          continue;
2495
13
        while (tensor_blocks[d].ref)
2496
0
          d = tensor_blocks[d].ref - 1;
2497
13
        int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2498
14
        while (tensor_blocks[bypass_ref].ref)
2499
1
          bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2500
        // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2501
13
        if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2502
10
          continue;
2503
3
        ccv_array_clear(empty_block.head);
2504
6
        for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2505
3
          ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j));
2506
3
        ccv_array_clear(empty_block.tail);
2507
6
        for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2508
3
          ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j));
2509
6
        for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2510
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), empty_block);
2511
6
        for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2512
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), empty_block);
2513
        // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2514
3
        assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref));
2515
3
        int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2516
3
        while (tensor_blocks[b_ref].ref)
2517
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2518
3
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2519
3
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2520
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2521
        // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2522
3
        can_bypass = can_bypass && (a_hop_b || b_hop_a);
2523
3
      }
2524
13
      if (can_bypass)
2525
10
      {
2526
22
        for (i = 0; i < node->output_size; i++)
2527
12
        {
2528
12
          int d = node->outputs[i];
2529
12
          if (d < 0)
2530
0
            continue;
2531
12
          if (!tensor_blocks[d].bypass_ref)
2532
2
            continue;
2533
10
          while (tensor_blocks[d].ref)
2534
0
            d = tensor_blocks[d].ref - 1;
2535
10
          int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2536
10
          while (tensor_blocks[bypass_ref].ref)
2537
0
            bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2538
          // The bypass_ref can extend its life-time.
2539
20
          for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2540
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), tensor_blocks[bypass_ref]);
2541
20
          for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2542
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), tensor_blocks[bypass_ref]);
2543
10
        }
2544
10
      } else {
2545
6
        for (i = 0; i < node->output_size; i++)
2546
3
          tensor_blocks[node->outputs[i]].bypass_ref = 0;
2547
3
        const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2548
        // Mark this exec as no bypass IO (thus, I need to insert explicit data transfer).
2549
3
        exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2550
3
      }
2551
13
    }
2552
32.1k
  } ccv_nnc_graph_visit_endfor
2553
6.23k
  ccv_array_free(empty_block.head);
2554
6.23k
  ccv_array_free(empty_block.tail);
2555
6.23k
  *r_exec_dep = exec_dep;
2556
6.23k
  *r_tensor_blocks = tensor_blocks;
2557
6.23k
}
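The first loop of this function builds exec_dep as a longest-hop transitive closure over the execution DAG: visiting nodes in topological order, each node first buffers its already-recorded ancestors, then for every outgoing edge records itself at one hop and re-records each buffered ancestor one hop further, keeping the larger hop count on collision. The following is a standalone sketch of that accumulation, assuming a dense matrix and an explicit topologically ordered edge list (names are hypothetical) instead of the graph visitor and sparse matrix.

#include <stdio.h>

#define N 4

/* dep[child][ancestor] = longest number of hops from ancestor to child, 0 = unrelated. */
static int dep[N][N];

int main(void)
{
  /* A tiny DAG, edges listed with sources in topological order: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3. */
  static const int edges[][2] = { {0, 1}, {0, 2}, {1, 3}, {2, 3} };
  int i, j;
  for (i = 0; i < (int)(sizeof(edges) / sizeof(edges[0])); i++)
  {
    const int src = edges[i][0], dst = edges[i][1];
    if (dep[dst][src] < 1)
      dep[dst][src] = 1; /* direct edge = 1 hop */
    for (j = 0; j < N; j++) /* propagate src's ancestors, one hop further away */
      if (dep[src][j] && dep[src][j] + 1 > dep[dst][j])
        dep[dst][j] = dep[src][j] + 1;
  }
  printf("dep[3][0] = %d\n", dep[3][0]); /* 2: the longest path 0 -> 1 -> 3 (or 0 -> 2 -> 3) */
  return 0;
}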
2558
2559
static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2560
33
{
2561
33
  if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2562
3
  {
2563
3
    ccv_nnc_cmd_t retval = cmd;
2564
3
    retval.cmd = CCV_NNC_NOOP;
2565
3
    return retval;
2566
3
  }
2567
30
  return cmd;
2568
33
}
2569
2570
static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2571
102
{
2572
102
  if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2573
47
  {
2574
47
    if (tensor_symbol_info[input].alias_ref)
2575
18
    {
2576
18
      const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2577
18
      assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2578
18
      ccv_nnc_tensor_symbol_t tensor_symbol = {};
2579
18
      if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2580
6
      {
2581
6
        tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2582
6
        if (tensor_symbol_info[alias_ref].pair_ref)
2583
0
          ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2584
0
            .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2585
0
            .graph = dup_graph->pair
2586
0
          });
2587
6
        ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2588
6
        dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2589
12
      } else {
2590
12
        tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2591
12
        tensor_symbol.graph = dup_graph;
2592
12
      }
2593
18
      ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2594
18
      if (tensor_symbol_info[input].pair_ref)
2595
0
        ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2596
0
          .d = tensor_symbol_info[input].pair_ref - 1,
2597
0
          .graph = dup_graph->pair
2598
0
        });
2599
18
      ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2600
18
      dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2601
29
    } else {
2602
29
      ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2603
29
      if (tensor_symbol_info[input].pair_ref)
2604
4
        ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2605
4
          .d = tensor_symbol_info[input].pair_ref - 1,
2606
4
          .graph = dup_graph->pair
2607
4
        });
2608
29
      ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2609
29
      dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2610
29
    }
2611
47
    if (tensor_symbol_info[input].bypass_ref)
2612
2
    {
2613
2
      const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2614
2
      assert(dup_bypass_ref >= 0);
2615
2
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2616
2
      symbol_info->bypass_ref = dup_bypass_ref + 1;
2617
2
    }
2618
47
  }
2619
102
  return (ccv_nnc_tensor_symbol_t) {
2620
102
    .d = dup_tensor_block_ref[input * unroll_count],
2621
102
    .graph = dup_graph,
2622
102
  };
2623
102
}
2624
2625
static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2626
72
{
2627
72
  int i;
2628
72
  if (dup_exec_ref[idx * unroll_count] < 0)
2629
44
  {
2630
    // Input has to come before output, because output could have a bypass reference to the input.
2631
116
    for (i = 0; i < node->input_size; i++)
2632
72
      max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2633
75
    for (i = 0; i < node->output_size; i++)
2634
31
      max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2635
44
    ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2636
44
    dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2637
44
  }
2638
72
  return (ccv_nnc_graph_exec_symbol_t) {
2639
72
    .d = dup_exec_ref[idx * unroll_count],
2640
72
    .graph = dup_graph,
2641
72
  };
2642
72
}
2643
2644
static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2645
6.23k
{
2646
6.23k
  int i;
2647
97.7k
  for (i = 0; i < tensor_block_size; i++)
2648
91.5k
  {
2649
91.5k
    if (tensor_blocks[i].head)
2650
62.6k
      ccv_array_free(tensor_blocks[i].head);
2651
91.5k
    if (tensor_blocks[i].tail)
2652
62.6k
      ccv_array_free(tensor_blocks[i].tail);
2653
91.5k
    if (tensor_blocks[i].r_refs)
2654
9.42k
      ccv_array_free(tensor_blocks[i].r_refs);
2655
91.5k
    if (tensor_blocks[i].dup_p_refs)
2656
22
      ccv_array_free(tensor_blocks[i].dup_p_refs);
2657
91.5k
  }
2658
6.23k
  ccfree(tensor_blocks);
2659
6.23k
}
2660
2661
// Find tensors that cannot be solved by co-allocating to the same location.
2662
static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2663
21
{
2664
21
  int i, j, unroll_count = 0;
2665
131
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2666
110
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_symbol_info[i].assign_ref)
2667
25
    {
2668
      // This is a parameter, thus, it has to be either an alias or used.
2669
25
      assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2670
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2671
      // The parameter it assigns to has to be either an alias or used.
2672
25
      assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2673
      // If any of these two (assigner and assignee) is an alias, check to see if they are the same.
2674
      // If it is the same, we are good, no need to extend.
2675
25
      int a_ref = i;
2676
25
      while (tensor_blocks[a_ref].ref)
2677
0
        a_ref = tensor_blocks[a_ref].ref - 1;
2678
25
      int b_ref = assign_ref;
2679
31
      while (tensor_blocks[b_ref].ref)
2680
6
        b_ref = tensor_blocks[b_ref].ref - 1;
2681
25
      if (a_ref != b_ref)
2682
19
      {
2683
        // If any of the b's head is deterministically later than a's tail
2684
        // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2685
19
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2686
19
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2687
        // It cannot be that both i can hop to j and j can hop to i.
2688
19
        assert(!(a_hop_b > 0 && b_hop_a > 0));
2689
        // Can it be folded
2690
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2691
19
        if (a_hop_b || b_hop_a)
2692
3
        {
2693
3
          tensor_blocks[a_ref].companion_ref = b_ref + 1;
2694
3
          tensor_blocks[b_ref].companion_ref = a_ref + 1;
2695
3
          continue;
2696
3
        }
2697
16
        int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2698
20
        for (j = 0; c_ref >= 0; j++)
2699
4
        {
2700
4
          while (tensor_blocks[c_ref].ref)
2701
0
            c_ref = tensor_blocks[c_ref].ref - 1;
2702
4
          c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2703
4
        }
2704
16
        unroll_count = ccv_max(unroll_count, j + 1);
2705
16
      }
2706
25
    }
2707
  // Reset companion_ref if need to unroll.
2708
21
  if (unroll_count)
2709
91
    for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2710
78
      tensor_blocks[j].companion_ref = 0;
2711
21
  return unroll_count;
2712
21
}
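The unroll count answers how many times a while-loop body must be duplicated so that every loop-carried parameter (assign_ref) and its source can live in distinct memory: when the two blocks' life-times interfere, the routine walks the chain of assign_refs behind the source and takes the longest such chain. Below is a much-simplified sketch that only measures the raw length of a carry chain; the real routine additionally checks interference through exec_dep before counting.

#include <stdio.h>

#define N 5

int main(void)
{
  /* assign_ref[i] is the 1-based index of the block whose value is carried into i
   * on the next loop iteration, 0 if i is not a loop-carried parameter. */
  static const int assign_ref[N] = { 0, 1, 2, 0, 0 }; /* 2 <- 1 <- 0 form a carry chain */
  int unroll_count = 0;
  int i;
  for (i = 0; i < N; i++)
    if (assign_ref[i])
    {
      /* Walk the chain of carries behind this parameter. */
      int hops = 1, ref = assign_ref[i] - 1;
      while (assign_ref[ref])
      {
        ++hops;
        ref = assign_ref[ref] - 1;
      }
      if (hops > unroll_count)
        unroll_count = hops;
    }
  printf("unroll_count = %d\n", unroll_count); /* 2: block 2 carries from 1, which carries from 0 */
  return 0;
}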
2713
2714
static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2715
13
{
2716
13
  int i, j, n;
2717
  // The inout exec nodes, these are the nodes we are going to extend.
2718
13
  uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2719
13
  int max_input_size = 0;
2720
13
  int max_output_size = 0;
2721
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2722
35
  {
2723
35
    max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
2724
35
    max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
2725
35
  }
2726
13
  ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
2727
13
  ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
2728
  // Doing graph expansion
2729
  // It goes without saying, we must have more than one tensor / exec (otherwise I cannot use 0 as no exec ref).
2730
13
  assert(dup_graph->exec_symbol_info->rnum > 0);
2731
13
  assert(dup_graph->tensor_symbol_info->rnum > 0);
2732
88
#define INCOMING_NODE (1)
2733
28
#define OUTGOING_NODE (2)
2734
  // Unroll the graph n times.
2735
29
  for (n = 0; n < unroll_count; n++)
2736
16
  {
2737
16
    int* const dup_exec_ref = r_dup_exec_ref + n;
2738
16
    const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2739
16
    int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2740
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2741
46
      dup_exec_ref[i * unroll_count] = -1;
2742
131
    for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2743
115
    {
2744
      // If there is an assign_ref, that means I don't need to dup the tensor.
2745
115
      if (tensor_symbol_info[i].assign_ref)
2746
25
      {
2747
25
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2748
25
        dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2749
90
      } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY)
2750
      // If this is a read-only tensor block, no need to duplicate because the value never changes
2751
      // (note we handled assign_ref first), therefore, no need to generate duplicate.
2752
26
        dup_tensor_block_ref[i * unroll_count] = i;
2753
64
      else
2754
64
        dup_tensor_block_ref[i * unroll_count] = -1;
2755
115
    }
2756
    // Go through the original graph, make copies of the node if it is inout.
2757
44
    ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2758
44
      ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2759
44
      inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2760
44
      if (!node->outgoings)
2761
16
        continue;
2762
56
      for (i = 0; i < node->outgoings->rnum; i++)
2763
28
      {
2764
28
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
2765
28
        inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2766
28
        ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2767
28
        ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2768
28
      }
2769
28
    } ccv_nnc_graph_visit_endfor
2770
    // Check that the visited nodes are all marked as either incoming or outgoing.
2771
16
    const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2772
16
    const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2773
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2774
46
    {
2775
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2776
2
        continue;
2777
46
      assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
2778
      // If this is a pure incoming node, then I need to concat this one with all original destination nodes
2779
44
      if (inout[i] == INCOMING_NODE)
2780
32
        for (j = 0; j < dup_destination_size; j++)
2781
16
        {
2782
16
          ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2783
16
            .d = dup_destinations[j].d,
2784
16
            .graph = dup_graph,
2785
16
          }, (ccv_nnc_graph_exec_symbol_t) {
2786
16
            .d = dup_exec_ref[i * unroll_count],
2787
16
            .graph = dup_graph,
2788
16
          });
2789
16
        }
2790
44
    }
2791
16
    if (dup_graph->destinations)
2792
16
      ccv_array_clear(dup_graph->destinations);
2793
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2794
46
    {
2795
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2796
2
        continue;
2797
44
      const int d = dup_exec_ref[i * unroll_count];
2798
44
      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
2799
      // If this has no outgoing node, add to the destination.
2800
44
      if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2801
16
        ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2802
16
          .graph = dup_graph,
2803
16
          .d = d,
2804
16
        });
2805
44
    }
2806
16
  }
2807
13
#undef INCOMING_NODE
2808
13
#undef OUTGOING_NODE
2809
13
  ccfree(inout);
2810
13
}
2811
2812
static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2813
13
{
2814
13
  int i;
2815
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2816
    // Now can assign them (The dup) as companion.
2817
    // Get to the last one, which we will wrap over.
2818
78
    if (dup_tensor_symbol_info[i].assign_ref)
2819
17
    {
2820
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2821
17
      dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2822
17
      assert(dup_tensor_symbol_info[i].assign_ref);
2823
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2824
17
    }
2825
13
}
2826
2827
// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2828
// However, it is not that simple if the graph is unrolled. For an unrolled graph, the life-time needs to reach the end of
2829
// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2830
static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2831
21
{
2832
21
  int i, j, k;
2833
45
  for (i = 0; i < p_node_info->output_size; 
i++24
)
2834
24
  {
2835
24
    const int d = p_node_info->outputs[i];
2836
24
    const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2837
24
    if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2838
6
      continue;
2839
36
    
for (k = 0; 18
k < destination_size;
k++18
)
2840
18
      _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2841
    // Add the duplicated destinations to the duplicated tensor blocks.
2842
42
    for (j = 0; j < unroll_count; 
j++24
)
2843
48
      
for (k = 0; 24
k < destination_size;
k++24
)
2844
24
      {
2845
24
        const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2846
24
        const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2847
24
        if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2848
24
          _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2849
24
      }
2850
18
  }
2851
21
}
2852
2853
static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2854
21
{
2855
21
  int i, j;
2856
21
  ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2857
21
  ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2858
  // Find the blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2859
  // Unfortunately, I cannot apply this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2860
  // No need to change anything, we are good.
2861
21
  const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2862
21
  if (!unroll_count)
2863
8
    return;
2864
  // There are conditions that cannot be satisfied with the simple solution (allocating to the same memory region).
2865
  // Do graph expansion: first duplicate the old graph, but replace all sub-graphs with noops.
2866
13
  ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2867
13
  int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2868
13
  int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2869
13
  _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2870
13
  ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2871
13
  ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2872
26
  ccv_nnc_graph_visit_t* dup_visit = 
ccv_nnc_graph_visit_new13
(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0);
2873
13
  ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2874
26
  _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2875
  // Free out the old exec_dep
2876
26
  ccv_matrix_free(exec_dep);
2877
  // and the tensor blocks, prepare for the new.
2878
26
  _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2879
  // A reverse map to find where the original tensor comes from.
2880
26
  int* dup_tensor_from_ref = (int*)
ccmalloc13
(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2881
142
  for (i = 0; i < dup_graph->tensor_symbol_info->rnum; 
i++129
)
2882
129
    dup_tensor_from_ref[i] = -1;
2883
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2884
193
    
for (j = 0; 78
j < unroll_count;
j++115
)
2885
115
      if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2886
104
        dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2887
26
  int* dup_exec_from_ref = (int*)
ccmalloc13
(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2888
90
  for (i = 0; i < dup_graph->exec_symbol_info->rnum; 
i++77
)
2889
77
    dup_exec_from_ref[i] = -1;
2890
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++35
)
2891
35
  {
2892
35
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2893
2
      continue;
2894
33
    dup_exec_from_ref[i] = i; // Reference back.
2895
77
    for (j = 0; j < unroll_count; 
j++44
)
2896
44
      if (dup_exec_ref[i * unroll_count + j] >= 0)
2897
44
        dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2898
33
  }
2899
  // Reset all attr.
2900
26
  memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2901
26
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get13
(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get13
(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2902
26
  ccv_nnc_graph_visit_free(dup_visit);
2903
26
  
ccfree13
(dup_exec_symbol_info);
2904
26
  
ccfree13
(dup_exec_from_ref);
2905
26
  
ccfree13
(dup_tensor_from_ref);
2906
  // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2907
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2908
    // Loop over all possible duplications to assign dup_p_ref properly.
2909
193
    
for (j = 0; 78
j < unroll_count;
j++115
)
2910
115
    {
2911
115
      const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2912
115
      if (dup_idx >= 0 && 
(104
tensor_blocks[i].p_refs[0]104
||
tensor_blocks[i].p_refs[1]60
))
2913
44
      {
2914
44
        const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2915
44
        const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2916
44
        if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2917
28
        {
2918
28
          if (!tensor_blocks[dup_idx].dup_p_refs)
2919
22
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2920
28
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2921
28
        }
2922
44
        if (p_ref_0_is_in_or_out == 1 || 
tensor_blocks[i].p_refs[1] == 016
)
2923
44
          continue;
2924
0
        const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2925
0
        const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2926
0
        if (p_ref_1_is_in_or_out == 1)
2927
0
        {
2928
0
          if (!tensor_blocks[dup_idx].dup_p_refs)
2929
0
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2930
0
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2931
0
        }
2932
0
      }
2933
115
    }
2934
  // companion_ref
2935
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2936
    // Now we can assign them (the dup) as companions.
2937
78
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
dup_tensor_symbol_info[i].assign_ref71
)
2938
17
    {
2939
      // Get to the last one, which we will wrap over.
2940
17
      const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2941
17
      if (assign_ref >= 0)
2942
17
      {
2943
17
        int b_ref = assign_ref;
2944
17
        while (tensor_blocks[b_ref].ref)
2945
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2946
17
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2947
17
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2948
        // It cannot be that both i can hop to j and j can hop to i.
2949
        // And it is possible to hop from one to the other now after duplication.
2950
17
        assert(a_hop_b > 0 || b_hop_a > 0);
2951
17
        tensor_blocks[i].companion_ref = b_ref + 1;
2952
17
        tensor_blocks[b_ref].companion_ref = i + 1;
2953
17
      }
2954
17
    }
2955
13
  ccfree(dup_tensor_symbol_info);
2956
  // Extend the dup tensor block ref, prepare for future extensions.
2957
13
  dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2958
110
  for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; 
i++97
)
2959
97
    dup_tensor_block_ref[i] = -1;
2960
  // Assign out changed properties.
2961
13
  *r_exec_dep = exec_dep;
2962
13
  *r_tensor_blocks = tensor_blocks;
2963
13
  *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2964
13
  *r_dup_graph = dup_graph;
2965
13
  *r_unroll_count = unroll_count;
2966
13
  *r_dup_exec_ref = dup_exec_ref;
2967
13
  *r_dup_tensor_block_ref = dup_tensor_block_ref;
2968
13
}
2969
2970
static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2971
31
{
2972
31
  if (!anonymous_block_free_list || 
!anonymous_block_free_list_cap15
)
2973
28
    return tensor_block_size;
2974
3
  int i;
2975
3
  const int no_dup_p_refs = (!dup_p_refs || 
!dup_p_refs->rnum0
);
2976
3
  int found_idx = tensor_block_size;
2977
3
  for (i = 0; i < anonymous_block_free_list_cap; 
i++0
)
2978
3
  {
2979
3
    const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2980
3
    assert(idx < tensor_block_size);
2981
    // If the type doesn't match, ignore.
2982
3
    if (tensor_blocks[idx].type != type)
2983
0
      continue;
2984
    // Heuristic about how to select the best tensor block to move forward.
2985
    // If the size is larger and no dup_p_refs found, I cannot do better than this, just return directly.
2986
3
    if (tensor_blocks[idx].size >= size)
2987
3
    {
2988
3
      if (no_dup_p_refs)
2989
3
        return idx;
2990
      // Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
2991
      // then we cannot do better than this; if that is the case, just return.
2992
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2993
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2994
0
        return idx;
2995
0
    }
2996
0
    int64_t found_idx_size_diff;
2997
0
    int64_t idx_size_diff;
2998
0
    if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2999
      // Now, compare whether this one or the found_idx one is better.
3000
      // At this point, there is no point in comparing the dup_p_refs, we only care about which one
3001
      // is closer to the size we request. Only on a tie does dup_p_refs or not become important again.
3002
0
      (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3003
0
    {
3004
0
      found_idx = idx;
3005
0
      continue;
3006
0
    }
3007
    // No need to update if found_idx is better than idx.
3008
0
    if (found_idx_size_diff > idx_size_diff)
3009
0
      continue;
3010
    // We bias towards the bigger one in case of a tie.
3011
0
    if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3012
0
    {
3013
0
      found_idx = idx;
3014
0
      continue;
3015
0
    }
3016
0
    assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
3017
    // On a tie, check which one has tighter life-cycle.
3018
0
    if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3019
0
    {
3020
      // Check whether the current tensor block's life-cycle is longer than the previous one.
3021
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3022
0
        (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3023
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3024
0
        found_idx = idx;
3025
0
      continue;
3026
0
    }
3027
    // Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
3028
    // We prefer to choose the one that has a life-cycle closer to the expected ones.
3029
0
    if (no_dup_p_refs)
3030
0
    {
3031
      // Whoever is shorter wins.
3032
0
      if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3033
0
        (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3034
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3035
0
        found_idx = idx;
3036
0
      continue;
3037
0
    }
3038
0
    if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3039
0
      continue;
3040
0
    if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3041
0
    {
3042
0
      found_idx = idx;
3043
0
      continue;
3044
0
    }
3045
    // If both cover the requested dup_p_refs, we prefer the shorter one; otherwise we prefer the longer one.
3046
0
    const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3047
0
    const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3048
0
    if (idx_after_request && found_idx_after_request)
3049
0
    {
3050
0
      if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3051
0
        found_idx = idx;
3052
0
      continue;
3053
0
    } else {
3054
      // If we entered this branch, either idx_after_request is false, or found_idx_after_request is false, or both.
3055
      // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3056
      // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3057
0
      if (!found_idx_after_request && (idx_after_request ||
3058
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3059
0
        found_idx = idx;
3060
0
      continue;
3061
0
    }
3062
0
  }
3063
0
  return found_idx;
3064
3
}
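A simplified sketch of the core of the selection heuristic above, assuming we only compare sizes (the type check and the dup_p_refs tie-breaking are omitted); the function name and signature are illustrative, not the ccv API: pick the candidate whose size is closest to the request, and on a tie prefer the bigger block.

#include <stdint.h>
#include <stdlib.h>

// Returns the index of the best candidate, or -1 if there is none.
static int pick_closest_block(const uint64_t* const sizes, const int count, const uint64_t request)
{
  int found = -1;
  int64_t found_diff = 0;
  int i;
  for (i = 0; i < count; i++)
  {
    const int64_t diff = llabs((int64_t)sizes[i] - (int64_t)request);
    if (found < 0 || diff < found_diff ||
      (diff == found_diff && sizes[i] > sizes[found]))
      found = i, found_diff = diff;
  }
  return found;
}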
3065
3066
static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3067
49
{
3068
49
  if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3069
28
    return 0;
3070
21
  int i, j, k;
3071
21
  int input_size = 0;
3072
43
  for (i = 0; i < p_node_info->p_while.input_size; 
i++22
)
3073
22
    if (p_node_info->p_while.inputs[i] >= 0)
3074
2
      ++input_size;
3075
  // If it doesn't have tensor inputs (thus, only special inputs), just return.
3076
21
  if (!input_size)
3077
19
    return 0;
3078
2
  ccv_nnc_tensor_symbol_t inputs[input_size];
3079
2
  input_size = 0;
3080
6
  for (i = 0; i < p_node_info->p_while.input_size; 
i++4
)
3081
4
    if (p_node_info->p_while.inputs[i] >= 0)
3082
2
      inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3083
2
        .d = p_node_info->p_while.inputs[i],
3084
2
        .graph = symbolic_graph,
3085
2
      };
3086
2
  assert(symbolic_graph->breakpoint_size > 0);
3087
2
  ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
3088
2
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3089
4
  for (i = 0; i < symbolic_graph->breakpoint_size; 
i++2
)
3090
2
  {
3091
    // Make a noop copy of the breakpoint, but with some tensor inputs.
3092
2
    ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
3093
2
    ccv_array_push(dup_breakpoints, &noop);
3094
    // Connect this noop to the outgoing nodes of breakpoints.
3095
2
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
3096
2
    if (symbol_info->outgoings)
3097
4
      
for (j = 0; 2
j < symbol_info->outgoings->rnum;
j++2
)
3098
2
      {
3099
2
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3100
2
        ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3101
2
          .d = d,
3102
2
          .graph = symbolic_graph,
3103
2
        });
3104
2
      }
3105
2
  }
3106
7
  for (i = 0; i < exec_symbol_info_size; 
i++5
)
3107
5
  {
3108
5
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
3109
5
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
3110
0
      continue;
3111
5
    if (symbol_info->outgoings)
3112
3
    {
3113
3
      const int outgoing_size = symbol_info->outgoings->rnum;
3114
6
      for (j = 0; j < outgoing_size; 
j++3
)
3115
3
      {
3116
3
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3117
6
        for (k = 0; k < symbolic_graph->breakpoint_size; 
k++3
)
3118
3
          if (d == symbolic_graph->breakpoints[k].d)
3119
0
          {
3120
0
            ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
3121
0
            ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3122
0
              .d = i,
3123
0
              .graph = symbolic_graph,
3124
0
            }, noop);
3125
            // Found, connected, exit.
3126
0
            break;
3127
0
          }
3128
3
      }
3129
3
    }
3130
5
  }
3131
  // Add the dup_breakpoints to sources if necessary.
3132
2
  assert(symbolic_graph->sources);
3133
2
  const int source_size = symbolic_graph->sources->rnum;
3134
4
  for (i = 0; i < source_size; 
i++2
)
3135
2
  {
3136
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
3137
2
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++0
)
3138
2
      if (d == symbolic_graph->breakpoints[j].d)
3139
2
      {
3140
2
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3141
2
        ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3142
        // Found, made, exit.
3143
2
        break;
3144
2
      }
3145
2
  }
3146
  // Add the dup_breakpoints to destinations if necessary.
3147
2
  assert(symbolic_graph->destinations);
3148
2
  const int destination_size = symbolic_graph->destinations->rnum;
3149
4
  for (i = 0; i < destination_size; 
i++2
)
3150
2
  {
3151
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
3152
4
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++2
)
3153
2
      if (d == symbolic_graph->breakpoints[j].d)
3154
0
      {
3155
0
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3156
0
        ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3157
        // Found, made, exit.
3158
0
        break;
3159
0
      }
3160
2
  }
3161
2
  return dup_breakpoints;
3162
2
}
3163
3164
// Plan out how we allocate tensors (should I do optimizations on the graph here or not at all?).
3165
static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3166
6.22k
{
3167
6.22k
  assert(source_size > 0);
3168
6.22k
  assert(destination_size > 0);
3169
  // First, fill all the "auto" holes.
3170
  // This is the symbol table with the "auto" info filled up.
3171
6.22k
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3172
6.22k
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3173
6.22k
  ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3174
12.4k
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new6.22k
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3175
0
  ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3176
12.4k
  int i, j, k, p, q;
3177
12.4k
  const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info = 
p_exec_symbol_info6.22k
?
p_exec_symbol_info + (symbolic_graph->exec_idx - 1)49
:
06.17k
;
3178
12.4k
  ccv_sparse_matrix_t* exec_dep;
3179
12.4k
  ccv_nnc_tensor_block_t* tensor_blocks;
3180
12.4k
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3181
12.4k
  int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3182
  // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3183
  // are automatically filled in, and all the sub-graphs are processed.
3184
  // There is a last step though, for a while loop, it is parameterized:
3185
  // while (x > 5) {
3186
  //     y = x + 1;
3187
  // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3188
  // We will do our best to avoid the actual data copy; what we do here is to check whether y can be x's alias.
3189
  // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
3190
  // it is an inplace operation.
3191
  // But if y cannot be x's alias, for example, this while loop looks like this:
3192
  // while (x > 5) {
3193
  //     y = x + a
3194
  //     b = x + y
3195
  // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3196
  // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3197
  // has dependency on y as well).
3198
  // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3199
  // y = x + a -> b = x + y
3200
  // This graph will be extended to look like this:
3201
  // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3202
  // while (x0 > 5) {
3203
  //     y0 = x0 + a0
3204
  //     b0 = x0 + y0
3205
  //     if (y0 > 5) break
3206
  //     y1 = y0 + b0
3207
  //     b1 = y0 + y1
3208
  // } (y1 => x0, b1 => a0)
3209
  // After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere
3210
  // with each other now).
3211
  // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
3212
  // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
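As a stand-alone illustration of the pointer switch described above (plain C, not the ccv tensor_multiview_t machinery): once the body is duplicated, the carried assignments (y => x, b => a) reduce to swapping which buffer each name points at, so no data copy is needed at the iteration boundary.

#include <stdio.h>

// One iteration of the loop body from the comment: y = x + a; b = x + y.
static void body(const float* x, const float* a, float* y, float* b)
{
  y[0] = x[0] + a[0];
  b[0] = x[0] + y[0];
}

int main(void)
{
  float x0 = 6, a0 = -2, y0, b0;
  float* x = &x0, * a = &a0, * y = &y0, * b = &b0;
  while (x[0] > 5)
  {
    body(x, a, y, b);
    // (y => x, b => a): instead of copying values back, switch the pointers,
    // which is roughly the net effect the unrolled graph achieves.
    float* t;
    t = x, x = y, y = t;
    t = a, a = b, b = t;
  }
  printf("x = %g, a = %g\n", x[0], a[0]); // prints x = 4, a = 10
  return 0;
}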
3213
12.4k
  ccv_nnc_symbolic_graph_t* dup_graph = 0;
3214
12.4k
  int* dup_exec_ref = 0;
3215
12.4k
  int* dup_tensor_block_ref = 0;
3216
12.4k
  int unroll_count = 0;
3217
  // In true recursive fashion, I need to call all the sub-graphs and do the pre-compilation for them one by one.
3218
12.4k
  ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)
ccmalloc6.22k
(sizeof(ccv_nnc_symbolic_graph_prep_t));
3219
12.4k
  prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3220
12.4k
  prep->flags = 0;
3221
  // Cannot handle duplicating a node that is a graph as well.
3222
12.4k
  if (
p_exec_symbol_info6.22k
)
3223
49
  {
3224
49
    prep->flags = p_node_info->flags;
3225
49
    if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3226
21
    {
3227
21
      _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3228
21
      _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3229
28
    } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3230
      // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3231
28
    }
3232
49
  }
3233
12.4k
  ccv_nnc_symbolic_graph_prep_t** sub_preps = 
symbolic_graph->sub_graphs6.22k
&&
symbolic_graph->sub_graphs->rnum29
?
(ccv_nnc_symbolic_graph_prep_t**)29
cccalloc29
(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) :
06.19k
;
3234
12.4k
  ccv_array_t* anonymous_block_free_list = 0;
3235
12.4k
  const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3236
  // Record whether this tensor is folded in this round.
3237
12.4k
  uint32_t* const tensor_fold = (uint32_t*)
ccmalloc6.22k
(sizeof(uint32_t) * tensor_fold_size);
3238
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3239
32.1k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3240
49
    {
3241
49
      assert(symbolic_graph->sub_graphs);
3242
49
      ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3243
49
      ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3244
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3245
49
      sub_prep->dup_breakpoints = dup_breakpoints;
3246
49
      sub_prep->p = prep;
3247
49
      sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3248
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3249
49
      const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3250
293
      for (i = 0; i < s_alloc_prep->block_size; 
i++244
)
3251
244
      {
3252
244
        const int block_ref = s_alloc_prep->blocks[i].block_ref;
3253
244
        const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3254
244
        if (block_ref < sub_prep->tensor_symbol_info_size)
3255
192
        {
3256
          // If this block has a bypass, and its bypass has different p_refs, then it doesn't matter.
3257
          // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3258
192
          if (s_tensor_blocks[block_ref].bypass_ref)
3259
1
          {
3260
1
            int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3261
1
            while (s_tensor_blocks[bypass_ref].ref)
3262
0
              bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3263
1
            if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3264
1
              
s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1]0
)
3265
1
              continue;
3266
1
          }
3267
191
          if (s_tensor_blocks[block_ref].p_refs[0])
3268
91
          {
3269
            /* If it is already properly assigned, next. */
3270
91
            if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3271
91
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3272
91
            {
3273
91
              if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3274
90
                s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3275
1
              else {
3276
1
                assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3277
1
                s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3278
1
              }
3279
91
            }
3280
            /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3281
91
            if (s_tensor_blocks[block_ref].p_refs[1] &&
3282
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1]3
&&
3283
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1]3
)
3284
3
            {
3285
3
              assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3286
3
              assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3287
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3288
3
            }
3289
91
          }
3290
191
        } else 
if (52
s_tensor_blocks[block_ref].dup_p_refs52
) {
3291
          /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of the anonymous block
3292
           * which by default only has its life-cycle shared with this sub-graph node. The reason to extend is that
3293
           * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3294
           * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3295
           * its life-time to the end of the output tensor. */
3296
15
          if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3297
13
            s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3298
33
          for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; 
j++18
)
3299
18
            ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3300
15
        }
3301
244
      }
3302
49
    }
3303
32.1k
    const int init_tensor_block_size = tensor_block_size;
3304
32.1k
    int rw_anonymous_buffer_size_cap = 0;
3305
32.1k
    int ro_anonymous_buffer_size_cap = 0;
3306
32.1k
    if (anonymous_block_free_list)
3307
17
      ccv_array_clear(anonymous_block_free_list);
3308
32.1k
    memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3309
32.1k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3310
49
    {
3311
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3312
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3313
49
      int rw_anonymous_buffer_size = 0;
3314
49
      int ro_anonymous_buffer_size = 0;
3315
229
      for (i = 0; i < s_alloc_prep->buffer_size; 
i++180
)
3316
180
        if (s_alloc_prep->buffers[i].p_refs[0])
3317
90
        {
3318
          /* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */
3319
90
          int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3320
          /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3321
90
          int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3322
90
          assert(p_ref_0_is_in_or_out != 0);
3323
90
          int unref_p_ref_0 = p_ref_0;
3324
92
          while (tensor_blocks[unref_p_ref_0].ref)
3325
2
            unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3326
          /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3327
90
          assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3328
90
          if (s_alloc_prep->buffers[i].p_refs[1])
3329
4
          {
3330
4
            int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3331
4
            const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3332
4
            assert(p_ref_1_is_in_or_out != 0);
3333
4
            int unref_p_ref_1 = p_ref_1;
3334
4
            while (tensor_blocks[unref_p_ref_1].ref)
3335
0
              unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3336
            /* See above comment for the similar p_ref_0 check. */
3337
4
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3338
4
            assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3339
4
            int p_ref_t;
3340
4
            if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3341
3
            {
3342
3
              CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3343
3
              CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3344
3
            }
3345
4
            p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3346
            /* If the dimension matches, we can fold. TODO: should the dimensions match perfectly here? */
3347
4
            if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3348
4
            {
3349
4
              const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3350
4
              if (folded)
3351
1
              {
3352
1
                p_ref_0 = p_ref_1;
3353
1
                unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3354
1
                tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3355
1
                for (j = 0; j < unroll_count; 
j++0
) /* Fold its duplicates as well. */
3356
0
                {
3357
0
                  const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3358
0
                  assert(folded && "the subsequent duplicates can be folded too.");
3359
0
                }
3360
1
              }
3361
4
            }
3362
4
          }
3363
          /* Only proceed if it is folded here (thus, the input / output tensor can be connected, and reuse is not a problem).
3364
           * Or if the p_ref_0 is the output and its life first starts from this node (thus, I have full control over
3365
           * its life-cycle). Or if the p_ref_0 is the input and it ends in this node (thus, I can take over its
3366
           * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3367
           * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3368
           * associated with it, then we are good. */
3369
90
          if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3370
90
            
(89
p_ref_0_is_in_or_out == 189
&&
_ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)50
) ||
3371
90
            
(39
p_ref_0_is_in_or_out == -139
&&
_ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)39
) ||
3372
90
            
TENSOR_READ_WRITE8
(s_alloc_prep->buffers[i]) == READ_ONLY8
)
3373
86
          {
3374
86
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3375
27
              { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3376
            /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3377
             * is a long argument why that is the case; the gist is, it is much easier to control your output
3378
             * than your input). */
3379
86
            s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3380
86
            s_alloc_prep->buffers[i].p_refs[1] = 0;
3381
            /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3382
86
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3383
86
            tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3384
95
            for (j = 0; j < unroll_count; 
j++9
) /* Change the size of its duplicates as well. */
3385
9
              tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3386
9
                tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3387
9
                  tensor_blocks[unref_p_ref_0].size;
3388
86
          } else {
3389
4
            s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3390
4
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3391
0
              ++ro_anonymous_buffer_size;
3392
4
            else
3393
4
              rw_anonymous_buffer_size += unroll_count + 1;
3394
4
          }
3395
90
        } else {
3396
90
          if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3397
63
            ++ro_anonymous_buffer_size;
3398
27
          else
3399
27
            rw_anonymous_buffer_size += unroll_count + 1;
3400
90
        }
3401
49
      if (ro_anonymous_buffer_size || 
rw_anonymous_buffer_size24
)
3402
28
      {
3403
28
        const int anonymous_block_free_list_cap = anonymous_block_free_list ? 
anonymous_block_free_list->rnum6
:
022
;
3404
        // All read-write buffers can (potentially) be reused between case..of branches.
3405
28
        rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3406
        // Read-only buffers cannot be reused between case..of branches.
3407
28
        ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3408
        /* Anonymous block, allocate additional tensor blocks for this. */
3409
        /* This is either because this is an internal tensor (it doesn't have a p_ref) */
3410
        /* or it is an anonymous block itself within the sub graphs of this while graph. */
3411
28
        tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3412
28
        memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3413
28
        if (dup_tensor_block_ref)
3414
3
          dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3415
174
        for (i = 0; i < s_alloc_prep->buffer_size; 
i++146
)
3416
146
          if (!s_alloc_prep->buffers[i].p_refs[0])
3417
94
          {
3418
94
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3419
63
            {
3420
63
              assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3421
63
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3422
63
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3423
63
              tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3424
63
              tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3425
63
              tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3426
63
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3427
63
              tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3428
63
              ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3429
63
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3430
63
              if (dup_p_refs && 
dup_p_refs->rnum > 00
)
3431
0
              {
3432
0
                for (j = 0; j < dup_p_refs->rnum; j++)
3433
0
                {
3434
0
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3435
0
                  assert(dup_p_ref >= 0);
3436
0
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3437
0
                  assert(tensor_blocks[dup_p_ref].tail);
3438
                  // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to
3439
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3440
0
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3441
0
                  {
3442
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3443
0
                    assert(p_node_info);
3444
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3445
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3446
0
                    {
3447
0
                      if (!tensor_blocks[tensor_block_size].dup_p_refs)
3448
0
                        tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3449
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3450
0
                    }
3451
0
                  }
3452
0
                  if (!tensor_blocks[tensor_block_size].tail)
3453
0
                    tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3454
0
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3455
0
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3456
0
                }
3457
63
              } else {
3458
63
                tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3459
63
                ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3460
63
              }
3461
132
              
for (j = 0; 63
j < source_size;
j++69
)
3462
69
                _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3463
              /* If this is read-only (based on SSA, it is first encountered as a read), and this is a
3464
               * sub-graph, mark it to the end of the graph. */
3465
63
              if (p_exec_symbol_info)
3466
12
                
for (j = 0; 6
j < destination_size;
j++6
)
3467
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3468
              /* If it is read-only, it is self-reflecting. */
3469
69
              for (k = 0; k < unroll_count; 
k++6
)
3470
6
              {
3471
12
                for (j = 0; j < destination_size; 
j++6
)
3472
6
                  if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3473
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3474
                /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3475
6
                assert(symbolic_graph->p);
3476
6
                dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3477
6
              }
3478
63
              ++tensor_block_size;
3479
63
            } else {
3480
31
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3481
31
              const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3482
31
              const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3483
              // Find suitable tensor block from the free list.
3484
31
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3485
31
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3486
31
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3487
31
              if (new_anonymous_tensor_block)
3488
28
              {
3489
28
                tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3490
28
                tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3491
28
                tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3492
28
                tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3493
28
                ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3494
28
              } else {
3495
3
                tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3496
3
                tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3497
3
              }
3498
31
              if (dup_p_refs && 
dup_p_refs->rnum > 05
)
3499
5
              {
3500
10
                for (j = 0; j < dup_p_refs->rnum; 
j++5
)
3501
5
                {
3502
5
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3503
5
                  assert(dup_p_ref >= 0);
3504
5
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3505
                  // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to
3506
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3507
5
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3508
0
                  {
3509
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3510
0
                    assert(p_node_info);
3511
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3512
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3513
0
                    {
3514
0
                      if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3515
0
                        tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3516
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3517
0
                    }
3518
0
                  }
3519
5
                  assert(tensor_blocks[dup_p_ref].tail);
3520
5
                  if (!tensor_blocks[tensor_block_idx].tail)
3521
5
                    tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3522
10
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; 
k++5
)
3523
5
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3524
                  // We have to add it to the wrap-around companion_ref as well.
3525
                  // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3526
                  // be occupied and unlikely to be reused), we cannot really do too much about it because the companion_ref's
3527
                  // definition is too free-form and if we enforce a stronger guarantee on this (such as it must wrap around), this
3528
                  // guarantee may be broken down the line.
3529
5
                  if (tensor_blocks[dup_p_ref].companion_ref)
3530
0
                  {
3531
0
                    const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3532
0
                    for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3533
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3534
0
                    for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3535
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3536
0
                  }
3537
5
                }
3538
26
              } else if (new_anonymous_tensor_block) {
3539
23
                tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3540
23
                ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3541
23
              }
3542
31
              const int prev_tensor_block_idx = tensor_block_idx;
3543
31
              if (new_anonymous_tensor_block)
3544
28
              {
3545
28
                if (!anonymous_block_free_list)
3546
16
                  anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3547
28
                ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3548
28
                ++tensor_block_size;
3549
28
              }
3550
32
              for (k = 0; k < unroll_count; 
k++1
)
3551
1
              {
3552
1
                const int tensor_block_idx = new_anonymous_tensor_block ?
3553
1
                  (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3554
1
                  
dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k]0
;
3555
1
                TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3556
1
                TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3557
1
                if (new_anonymous_tensor_block)
3558
1
                {
3559
1
                  tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3560
1
                  tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3561
1
                  tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3562
1
                  tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3563
                  /* Attach to duplicated exec for this tensor block. */
3564
1
                  ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3565
1
                } else {
3566
0
                  tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3567
0
                  tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3568
0
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3569
3570
0
                }
3571
1
                if (dup_p_refs && dup_p_refs->rnum > 0)
3572
1
                {
3573
                  /* Not nil, not self-reflecting. */
3574
2
                  for (j = 0; j < dup_p_refs->rnum; 
j++1
)
3575
1
                  {
3576
1
                    const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3577
1
                    assert(dup_p_ref >= 0);
3578
1
                    assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3579
                    // If it points to a p_ref upwards, check whether this is an output; if it is an output, add it to
3580
                    // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3581
1
                    if (tensor_symbol_info[dup_p_ref].p_ref)
3582
0
                    {
3583
0
                      const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3584
0
                      assert(p_node_info);
3585
0
                      const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3586
0
                      if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3587
0
                      {
3588
0
                        if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3589
0
                          tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3590
0
                        ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3591
0
                      }
3592
0
                    }
3593
1
                    assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3594
1
                    const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3595
1
                    assert(tensor_blocks[dup_dup_p_ref].tail);
3596
1
                    if (!tensor_blocks[tensor_block_idx].tail)
3597
1
                      tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3598
2
                    for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; 
q++1
)
3599
1
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3600
                    // We have to add it to the wrap around companion_ref as well.
3601
1
                    if (tensor_blocks[dup_dup_p_ref].companion_ref)
3602
0
                    {
3603
0
                      const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3604
0
                      for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3605
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3606
0
                      for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3607
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3608
0
                    }
3609
1
                  }
3610
1
                } else 
if (0
new_anonymous_tensor_block0
) {
3611
0
                  tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3612
0
                  ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3613
0
                }
3614
1
                if (new_anonymous_tensor_block)
3615
1
                  ++tensor_block_size;
3616
1
              }
3617
31
            }
3618
94
          }
3619
28
      }
3620
49
    }
3621
32.1k
  } ccv_nnc_graph_visit_endfor
3622
6.22k
  if (anonymous_block_free_list)
3623
16
    ccv_array_free(anonymous_block_free_list);
3624
6.22k
  ccfree(tensor_fold);
3625
  // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3626
  // the allocation dependencies, thus, which tensor is reused to the existing tensor.
3627
6.22k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3628
6.22k
  prep->while_count_tensor = 0;
3629
6.22k
  prep->dup_breakpoints = 0;
3630
6.22k
  prep->p = 0;
3631
6.22k
  prep->symbolic_graph = symbolic_graph;
3632
6.22k
  prep->p_idx = symbolic_graph->p_idx;
3633
6.22k
  prep->exec_idx = symbolic_graph->exec_idx;
3634
6.22k
  prep->sub_prep_size = symbolic_graph->sub_graphs ? 
symbolic_graph->sub_graphs->rnum29
:
06.19k
;
3635
6.22k
  prep->sub_preps = sub_preps;
3636
6.22k
  prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3637
6.22k
  prep->exec_symbol_info = exec_symbol_info;
3638
6.22k
  prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3639
6.22k
  prep->tensor_symbol_info = tensor_symbol_info;
3640
6.22k
  prep->unroll_count = unroll_count;
3641
6.22k
  prep->dup_tensor_block_ref = dup_tensor_block_ref;
3642
6.22k
  prep->tensor_block_size = tensor_block_size;
3643
6.22k
  prep->tensor_blocks = tensor_blocks;
3644
6.22k
  prep->exec_flags = exec_flags;
3645
6.22k
  prep->visit = visit;
3646
6.22k
  prep->alloc_prep = alloc_prep;
3647
6.22k
  if (dup_graph)
3648
13
    ccv_nnc_symbolic_graph_free(dup_graph);
3649
6.22k
  if (dup_exec_ref)
3650
13
    ccfree(dup_exec_ref);
3651
6.22k
  return prep;
3652
12.4k
}
3653
3654
static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3655
6.22k
{
3656
6.22k
  int i;
3657
6.22k
  _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3658
6.22k
  ccfree(prep->exec_flags);
3659
6.27k
  for (i = 0; i < prep->sub_prep_size; 
i++50
)
3660
50
    if (prep->sub_preps[i])
3661
49
      _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3662
6.22k
  if (prep->sub_preps)
3663
29
    ccfree(prep->sub_preps);
3664
6.22k
  ccfree(prep->tensor_symbol_info);
3665
6.22k
  ccfree(prep->exec_symbol_info);
3666
6.22k
  if (prep->dup_tensor_block_ref)
3667
13
    ccfree(prep->dup_tensor_block_ref);
3668
6.22k
  _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3669
6.22k
  ccv_nnc_graph_visit_free(prep->visit);
3670
6.22k
  ccfree(prep);
3671
6.22k
}
3672
3673
static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3674
6.22k
{
3675
6.22k
  int i, j;
3676
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3677
32.1k
    if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3678
21
    {
3679
21
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3680
21
      assert(graph_ref >= 0);
3681
21
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3682
43
      for (i = 0; i < node->p_while.input_size; 
i++22
)
3683
22
        if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i]))
3684
20
        {
3685
20
          ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3686
20
          const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i]);
3687
21
          for (j = 0; j < d; 
j++1
)
3688
1
            prep = prep->p;
3689
20
          prep->while_count_tensor = 1;
3690
20
        }
3691
21
    }
3692
32.1k
    
for (i = 0; 32.1k
i < node->graph_ref_size;
i++49
)
3693
49
    {
3694
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3695
49
      if (graph_ref >= 0)
3696
49
        _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3697
49
    }
3698
32.1k
  } ccv_nnc_graph_visit_endfor
3699
6.22k
}
3700
3701
static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3702
90.7k
{
3703
90.7k
  if (symbol >= 0)
3704
64.2k
    return graph_prep->tensor_arena->vt_tensors[symbol];
3705
26.5k
  if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3706
26.5k
    return 0;
3707
26.5k
  assert
(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))20
;
3708
20
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3709
20
  int i;
3710
20
  const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol);
3711
21
  for (i = 0; i < d; 
i++1
)
3712
1
    prep = prep->p;
3713
20
  assert(prep->while_count_tensor);
3714
20
  return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3715
20
}
3716
3717
static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3718
6.22k
{
3719
6.22k
  int i;
3720
6.22k
  int* const exec_cvt = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
3721
6.22k
  ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3722
6.22k
  graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3723
6.22k
  graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3724
6.22k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3725
58.2k
  for (i = 0; i < graph_exec_arena->graph_exec_size; 
i++52.0k
)
3726
52.0k
    if (graph_execs[i].graph == graph)
3727
32.1k
      graph_execs[i].d = exec_cvt[graph_execs[i].d];
3728
6.22k
  ccfree(exec_cvt);
3729
6.22k
}
3730
3731
static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3732
6.22k
{
3733
6.22k
  int i, j, k;
3734
6.22k
  ccv_nnc_graph_t* const graph = graph_prep->graph;
3735
6.22k
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3736
6.22k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3737
6.22k
  graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3738
6.22k
  graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3739
6.22k
  graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3740
6.22k
  graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3741
6.22k
  memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3742
6.22k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3743
6.22k
  int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3744
58.2k
  for (i = 0; i < exec_symbol_info_size; 
i++52.0k
)
3745
52.0k
  {
3746
52.0k
    max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size);
3747
52.0k
    max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size);
3748
52.0k
    if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3749
22
      max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size);
3750
52.0k
    graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3751
52.0k
    graph_execs[i].graph = 0;
3752
52.0k
  }
3753
6.27k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
3754
50
    max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size);
3755
6.22k
  ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)];
3756
6.22k
  ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)];
3757
6.22k
  ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)];
3758
6.22k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3759
6.22k
  const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3760
  // Create node, this is in topological order.
3761
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3762
32.1k
    if (CCV_NO_GRAPH_EXEC(graph_execs[idx]))
3763
32.1k
    {
3764
122k
      for (i = 0; i < node->input_size; 
i++90.7k
)
3765
90.7k
        max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3766
82.3k
      for (i = 0; i < node->output_size; 
i++50.2k
)
3767
50.2k
        max_outputs[i] = node->outputs[i] >= 0 ? 
tensor_arena->vt_tensors[node->outputs[i]]41.3k
:
08.91k
;
3768
32.1k
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3769
21
      {
3770
21
        const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3771
21
        assert(graph_ref >= 0);
3772
21
        ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3773
21
        ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3774
21
        graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3775
21
        const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3776
21
        ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3777
21
        ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3778
43
        for (i = 0; i < node->p_while.input_size; 
i++22
)
3779
22
          max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3780
42
        for (i = 0; i < sub_symbolic_graph->breakpoint_size; 
i++21
)
3781
21
          max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3782
21
        ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3783
21
        _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3784
32.0k
      } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3785
24
        for (i = 0; i < node->output_size; 
i++13
)
3786
13
          if (max_outputs[i] && max_outputs[i]->alias_ref)
3787
10
            max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3788
11
        graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3789
        // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3790
22
        for (i = 0; i < node->case_of.argument.offset; 
i++11
)
3791
11
        {
3792
11
          ccv_nnc_tensor_t* const update = max_inputs[i];
3793
11
          if (!CCV_IS_TENSOR_MULTIVIEW(update)) // No need if it is a naked tensor.
3794
9
            continue;
3795
2
          int flag = 0;
3796
2
          for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; 
j++0
)
3797
0
            flag = (update == max_inputs[j]);
3798
2
          if (!flag)
3799
2
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3800
2
        }
3801
11
        const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 
11
:
010
;
3802
11
        ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3803
11
        if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3804
1
        {
3805
          // Add another graph for data transfer.
3806
1
          ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3807
2
          for (i = 0; i < node->output_size; 
i++1
)
3808
1
            max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 
00
;
3809
1
          ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size), max_outputs, ccv_min(node->input_size, node->output_size));
3810
1
          ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3811
1
          ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3812
1
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3813
1
          int exec_cvt;
3814
1
          ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3815
1
        }
3816
39
        for (i = 0; i < node->graph_ref_size; 
i++28
)
3817
28
        {
3818
28
          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3819
28
          if (graph_ref < 0)
3820
0
            continue;
3821
28
          ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3822
28
          const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3823
28
          ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3824
28
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3825
28
          _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3826
28
        }
3827
32.0k
      } else {
3828
32.0k
        graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3829
32.0k
      }
3830
32.1k
      ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3831
32.1k
    }
3832
32.1k
  } ccv_nnc_graph_visit_endfor
3833
  // Then connect them.
3834
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3835
32.1k
    if (node->outgoings)
3836
53.1k
      
for (i = 0; 25.2k
i < node->outgoings->rnum;
i++27.9k
)
3837
27.9k
      {
3838
27.9k
        const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
3839
27.9k
        if (graph_execs[outgoing].graph)
3840
27.6k
          ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3841
27.9k
      }
3842
32.1k
  } ccv_nnc_graph_visit_endfor
3843
6.22k
  int source_exec_created = 0;
3844
6.22k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3845
6.22k
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3846
6.22k
  ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3847
  // After the graph is materialized, we need to handle the case where some of these tensors need to be initialized to zero before use.
3848
97.5k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++91.2k
)
3849
91.2k
  {
3850
91.2k
    if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags))
3851
127
    {
3852
127
      int ref = i;
3853
127
      while (tensor_symbol_info[ref].alias_ref)
3854
0
        ref = tensor_symbol_info[ref].alias_ref - 1;
3855
127
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref39
)
3856
0
        ref = tensor_blocks[ref].ref - 1;
3857
      // This is not computable. It could be that we marked a const tensor as init zero.
3858
127
      if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]))
3859
39
        continue;
3860
      // If this tensor is not used by any exec, we don't need to init at all. Skip.
3861
88
      if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3862
0
        continue;
3863
88
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3864
      // Now that we have the original tensor, we can get the actual tensor and construct the set command.
3865
88
      ccv_nnc_graph_exec_t set_exec;
3866
88
      if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3867
27
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3868
61
      else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3869
61
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3870
176
      for (j = 0; j < tensor_blocks[ref].head->rnum; 
j++88
)
3871
88
      {
3872
88
        const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j);
3873
88
        if (outgoing >= exec_symbol_info_size)
3874
0
          continue;
3875
88
        assert(outgoing >= 0);
3876
88
        assert(graph_execs[outgoing].graph);
3877
88
        ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3878
88
      }
3879
88
      int flags = 0;
3880
88
      if (alloc_dep[ref])
3881
50
        
for (j = 0; 25
j < alloc_dep[ref]->rnum;
j++25
)
3882
25
        {
3883
25
          const int d = *(int*)ccv_array_get(alloc_dep[ref], j);
3884
          // This is from alloc_dep; it should be computable.
3885
25
          assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
3886
25
          if (tensor_blocks[d].tail)
3887
50
            
for (k = 0; 25
k < tensor_blocks[d].tail->rnum;
k++25
)
3888
25
            {
3889
25
              const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
3890
25
              if (incoming >= exec_symbol_info_size)
3891
0
                continue;
3892
25
              assert(incoming >= 0);
3893
25
              assert(graph_execs[incoming].graph);
3894
25
              ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3895
25
              flags = 1;
3896
25
            }
3897
25
        }
3898
      // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3899
88
      if (!flags)
3900
63
      {
3901
63
        if (!source_exec_created)
3902
40
        {
3903
40
          graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3904
40
          source_exec_created = 1;
3905
40
        }
3906
63
        ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3907
63
      }
3908
88
    }
3909
91.2k
  }
3910
  // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3911
  // (we need that if a multi-view is not associated as an input / output of any exec; this is possible if all execs associate
3912
  // with its alias).
3913
6.22k
  assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size);
3914
97.5k
  
for (i = 0; 6.22k
i < tensor_arena->vt_tensor_size;
i++91.2k
)
3915
91.2k
  {
3916
91.2k
    ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3917
    // If it is a multiview tensor, inspect all its heads to see whether we have already associated it with the node.
3918
91.2k
    if (mv && 
CCV_IS_TENSOR_MULTIVIEW82.9k
(mv))
3919
53
    {
3920
53
      const ccv_array_t* const head = tensor_blocks[i].head;
3921
53
      if (head && 
head->rnum > 047
)
3922
94
        
for (j = 0; 47
j < head->rnum;
j++47
)
3923
47
        {
3924
47
          const int idx = *(int*)ccv_array_get(head, j);
3925
47
          if (idx >= exec_symbol_info_size)
3926
1
            continue;
3927
47
          assert
(idx >= 0)46
;
3928
46
          const int d = graph_execs[idx].d;
3929
46
          ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d);
3930
46
          int flag = 0;
3931
46
          if (exec_info->tensor_wraps_ref)
3932
32
          {
3933
32
            ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1);
3934
113
            for (k = 0; k < tensor_wrap_array->size && 
!flag88
;
k++81
)
3935
81
              flag = (tensor_wrap_array->tensor_wraps[k] && 
tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv55
);
3936
32
          }
3937
          // If none of them set the flag, it needs to be included in the broadcast.
3938
46
          if (!flag)
3939
19
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3940
46
        }
3941
53
    }
3942
91.2k
  }
3943
  // Create source / destination phony nodes. This is to facilitate use of the compiled graph.
3944
  // Also, this is needed if you have init zero execs.
3945
6.22k
  if (source_exec_created || 
source_size > 16.18k
)
3946
132
  {
3947
132
    if (!source_exec_created)
3948
92
      graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3949
561
    for (i = 0; i < source_size; 
i++429
)
3950
429
      ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3951
6.09k
  } else {
3952
6.09k
    assert(!source_exec_created);
3953
6.09k
    assert(source_size == 1);
3954
6.09k
    graph_exec_arena->source = graph_execs[sources[0].d];
3955
6.09k
  }
3956
6.22k
  if (destination_size == 1)
3957
6.14k
    graph_exec_arena->destination = graph_execs[destinations[0].d];
3958
86
  else {
3959
86
    graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3960
1.08k
    for (i = 0; i < destination_size; 
i++995
)
3961
995
      ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3962
86
  }
3963
6.22k
  ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3964
6.22k
  ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3965
6.22k
  return graph_exec_arena;
3966
6.22k
}
3967
3968
static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3969
11
{
3970
11
  if (graph_prep->symbolic_graph == pair)
3971
4
    return graph_prep->graph;
3972
7
  int i;
3973
10
  for (i = 0; i < graph_prep->sub_prep_size; 
i++3
)
3974
7
    if (graph_prep->sub_preps[i])
3975
7
    {
3976
7
      ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3977
7
      if (graph)
3978
4
        return graph;
3979
7
    }
3980
3
  return 0;
3981
7
}
3982
3983
static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3984
6.17k
{
3985
6.17k
  int i;
3986
6.22k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++43
)
3987
43
    if (graph_prep->sub_preps[i])
3988
42
    {
3989
42
      if (graph_prep->sub_preps[i]->symbolic_graph->pair)
3990
4
        graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
3991
42
    }
3992
6.17k
}
3993
3994
static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3995
6.22k
{
3996
6.22k
  assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
3997
6.22k
  int i;
3998
58.2k
  for (i = 0; i < graph_prep->exec_symbol_info_size; 
i++52.0k
)
3999
52.0k
  {
4000
52.0k
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags))
4001
12
      continue;
4002
52.0k
    if (graph_exec_arena->graph_execs[i].graph && 
graph_prep->exec_symbol_info[i].pair_ref32.1k
)
4003
15.8k
    {
4004
15.8k
      ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4005
15.8k
        .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4006
15.8k
        .graph = graph_prep->symbolic_graph->pair ? 
graph_prep->symbolic_graph->pair4
:
graph_prep->symbolic_graph15.8k
,
4007
15.8k
      });
4008
15.8k
      if (pair_exec.d >= 0)
4009
587
        ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4010
15.8k
    }
4011
52.0k
  }
4012
6.27k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
4013
50
    if (graph_prep->sub_preps[i])
4014
49
      _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4015
6.22k
}
4016
4017
static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4018
6.22k
{
4019
6.22k
  int i;
4020
6.22k
  if (graph_prep->dup_breakpoints)
4021
2
  {
4022
    // Stripping the const modifier is only possible because it is a sub-graph.
4023
2
    ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4024
4
    for (i = 0; i < graph_prep->dup_breakpoints->rnum; 
i++2
)
4025
2
      ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i));
4026
2
    ccv_array_free(graph_prep->dup_breakpoints);
4027
2
    graph_prep->dup_breakpoints = 0;
4028
2
    graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4029
    // Afterwards, we have to regenerate the exec_symbol_info and fill in the information (through symbol_infer).
4030
2
    memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4031
    // Since exec_symbol_info changed, create a new visit object.
4032
2
    assert(symbolic_graph->sources);
4033
2
    assert(symbolic_graph->destinations);
4034
2
    ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0);
4035
2
    const int source_size = symbolic_graph->sources->rnum;
4036
2
    ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0);
4037
2
    const int destination_size = symbolic_graph->destinations->rnum;
4038
4
    ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new2
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
4039
0
    ccv_nnc_graph_visit_free(graph_prep->visit);
4040
4
    graph_prep->visit = visit;
4041
4
    assert(graph_prep->p);
4042
2
    ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4043
2
  }
4044
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
4045
32.1k
    for (i = 0; i < node->graph_ref_size; 
i++49
)
4046
49
    {
4047
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
4048
49
      if (graph_ref >= 0)
4049
49
        _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4050
49
    }
4051
32.1k
  } ccv_nnc_graph_visit_endfor
4052
6.22k
}
4053
4054
const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4055
4056
void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4057
6.17k
{
4058
6.17k
  assert(graph_ref);
4059
6.17k
  assert(tensor_arena_ref);
4060
6.17k
  assert(graph_exec_arena_ref);
4061
6.17k
  int i;
4062
  // Cannot bind the multi-view.
4063
53.7k
  for (i = 0; i < tensor_bind_size; 
i++47.5k
)
4064
47.5k
  {
4065
47.5k
    assert(tensor_binds[i].tensor);
4066
47.5k
    assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor));
4067
47.5k
  }
4068
6.17k
  ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4069
6.17k
  _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4070
6.17k
  ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4071
6.17k
  _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4072
6.17k
  *tensor_arena_ref = tensor_arena;
4073
  // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
4074
6.17k
  _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4075
  // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
4076
6.17k
  _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4077
6.17k
  *graph_ref = graph_prep->graph;
4078
6.17k
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4079
6.17k
  _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4080
6.17k
  _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4081
6.17k
  *graph_exec_arena_ref = graph_exec_arena;
4082
6.17k
  _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4083
6.17k
}
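The entry point above produces the three objects that make up a compiled graph: the concrete graph, the tensor arena, and the graph exec arena. A minimal calling sketch follows, assuming `symbolic_graph` is an already-built ccv_nnc_symbolic_graph_t* with its sources and destinations set; ccv_nnc_graph_free is assumed from the graph API (it is not defined in this file), while the source/destination accessors and the two arena free functions appear elsewhere in this listing.

/* A minimal sketch, not the only way to drive compilation. */
ccv_nnc_graph_t* graph = 0;
ccv_nnc_tensor_arena_t* tensor_arena = 0;
ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
	0, 0, /* no tensor binds */
	0, 0, /* no explicit outputs to keep */
	ccv_nnc_symbolic_graph_sources(symbolic_graph), ccv_nnc_symbolic_graph_source_size(symbolic_graph),
	ccv_nnc_symbolic_graph_destinations(symbolic_graph), ccv_nnc_symbolic_graph_destination_size(symbolic_graph),
	&graph, &tensor_arena, &graph_exec_arena);
/* ... run the concrete graph ... */
ccv_nnc_graph_free(graph); /* assumed from the graph API */
ccv_nnc_tensor_arena_free(tensor_arena);
ccv_nnc_graph_exec_arena_free(graph_exec_arena);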
4084
4085
static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4086
6.22k
{
4087
  // Buffers are inherited from above; no need to dealloc.
4088
6.22k
  int i;
4089
6.27k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
4090
50
    if (tensor_arena->sub_arenas[i])
4091
49
      _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4092
6.28k
  for (i = 0; i < tensor_arena->m_tensor_idx->rnum; 
i++61
)
4093
61
  {
4094
61
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i));
4095
61
    assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv));
4096
61
    ccv_nnc_tensor_multiview_free(*mv);
4097
61
  }
4098
6.22k
  ccv_array_free(tensor_arena->tensor_metadata);
4099
6.22k
  ccv_array_free(tensor_arena->m_tensor_idx);
4100
6.22k
  if (tensor_arena->pb_vt_tensors)
4101
73
    ccfree(tensor_arena->pb_vt_tensors);
4102
6.22k
  if (tensor_arena->vt_alias_r_refs_p)
4103
73
    ccfree(tensor_arena->vt_alias_r_refs_p);
4104
6.22k
  if (tensor_arena->vt_sizes)
4105
5
    ccfree(tensor_arena->vt_sizes);
4106
6.22k
  ccfree(tensor_arena);
4107
6.22k
}
4108
4109
void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4110
83.4k
{
4111
83.4k
  assert(tensor_arena->graph_ref == (intptr_t)symbol.graph);
4112
83.4k
  assert(symbol.d < tensor_arena->vt_tensor_size);
4113
83.4k
  assert(symbol.d >= 0);
4114
  // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4115
83.4k
  int i;
4116
83.4k
  if (!tensor_arena->pb_vt_tensors)
4117
73
  {
4118
73
    tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4119
7.69k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++7.62k
)
4120
7.62k
      if (tensor_arena->vt_tensors[i])
4121
6.36k
        tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4122
73
  }
4123
83.4k
  if (!tensor_arena->vt_alias_r_refs_p)
4124
73
  {
4125
73
    tensor_arena->vt_alias_r_refs_p = (int*)cccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4126
73
    tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4127
7.69k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++7.62k
)
4128
7.62k
      if (tensor_arena->vt_alias_refs[i])
4129
565
      {
4130
565
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4131
565
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
4132
565
        ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4133
565
      }
4134
73
    int refp = 0;
4135
7.69k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++7.62k
) // For each tensor with aliases, allocate its position on vt_alias_r_refs. It points to the end.
4136
7.62k
      if (tensor_arena->vt_alias_r_refs_p[i])
4137
560
        refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4138
7.06k
      else
4139
7.06k
        tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4140
7.13k
    for (i = refp; i < tensor_arena->vt_tensor_size; 
i++7.05k
)
4141
7.05k
      tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4142
7.69k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++7.62k
)
4143
7.62k
      if (tensor_arena->vt_alias_refs[i])
4144
565
      {
4145
565
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4146
565
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
4147
565
        const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4148
565
        assert(pos >= 0);
4149
565
        tensor_arena->vt_alias_r_refs[pos] = i;
4150
565
      }
4151
73
  }
4152
83.4k
  const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? 
tensor_arena->vt_alias_refs[symbol.d] - 11
:
symbol.d83.4k
;
4153
83.4k
  if (CCV_IS_TENSOR_VIEW(tensor))
4154
0
  {
4155
0
    assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0); // We cannot handle off > 0 at the moment; it is possible, but requires additional verification.
4156
0
    assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&
4157
0
          ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
4158
0
        (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
4159
0
  } else
4160
83.4k
    { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
4161
83.4k
  if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
4162
0
    { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
4163
83.4k
  tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4164
83.4k
  if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4165
5
    
for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; 2
i < tensor_arena->vt_tensor_size;
i++3
)
4166
5
    {
4167
5
      const int d = tensor_arena->vt_alias_r_refs[i];
4168
5
      if (d < 0 || 
symbol_d + 1 != tensor_arena->vt_alias_refs[d]3
) // Doesn't match, reached the end of it.
4169
2
        break;
4170
3
      ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4171
3
      d_tensor->info.datatype = tensor->info.datatype;
4172
3
      d_tensor->info.reserved = tensor->info.reserved;
4173
3
      if (CCV_IS_TENSOR_VIEW(d_tensor))
4174
1
        ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4175
2
      else {
4176
2
        d_tensor->data.u8 = tensor->data.u8;
4177
2
        d_tensor->dataof = tensor->dataof;
4178
2
      }
4179
3
    }
4180
83.4k
}
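The binding path above lets a caller point a compiled symbol at externally owned storage between runs; the assertions require the external tensor to be at least as large as the arena's tensor for that symbol. A minimal sketch, assuming `tensor_arena` and `input_symbol` come from an earlier compile (illustrative names), and that ccv_nnc_tensor_new / ccv_nnc_tensor_free and the CPU_TENSOR_NHWC macro are available from the public headers:

/* A minimal sketch of rebinding a symbol to caller-owned memory for one run. */
ccv_nnc_tensor_t* my_input = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 16), 0);
ccv_nnc_tensor_bind_symbol(tensor_arena, input_symbol, my_input);
/* ... run the graph; execs bound to input_symbol now read my_input's memory ... */
ccv_nnc_tensor_arena_clear_bindings(tensor_arena); /* restore the arena's original data pointers */
ccv_nnc_tensor_free(my_input);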
4181
4182
void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4183
14.5k
{
4184
14.5k
  if (!tensor_arena->pb_vt_tensors)
4185
34
    return;
4186
14.4k
  int i;
4187
483k
  for (i = 0; i < tensor_arena->vt_tensor_size; 
i++469k
)
4188
469k
    if (tensor_arena->vt_tensors[i])
4189
295k
      tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4190
14.4k
}
4191
4192
uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4193
2
{
4194
2
  uint64_t total_size = 0;
4195
2
  int i;
4196
36
  for (i = 0; i < tensor_arena->buffer_size; 
i++34
)
4197
34
    total_size += tensor_arena->buffers[i].size;
4198
2
  return total_size;
4199
2
}
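For memory budgeting, the helper above simply sums the sizes of every backing buffer the arena allocated. A tiny sketch (printf from <stdio.h> assumed, `tensor_arena` as in the earlier sketches):

/* Report how much backing memory the compiled graph reserved. */
printf("tensor arena reserves %llu bytes\n", (unsigned long long)ccv_nnc_tensor_arena_size(tensor_arena));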
4200
4201
static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4202
0
{
4203
0
  int i;
4204
0
  if (mv->it)
4205
0
    mv->it->info = params;
4206
0
  for (i = 0; i < mv->repeat + mv->kind; i++)
4207
0
  {
4208
0
    ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
4209
0
    if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4210
0
      _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4211
0
    else
4212
0
      tensor->info = params;
4213
0
  }
4214
0
}
4215
4216
int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4217
2.20k
{
4218
2.20k
  int i;
4219
2.20k
  assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
4220
2.20k
  if (!tensor_arena->vt_sizes) // Keep the original sizes so we can check against them to see if we will overflow.
4221
5
  {
4222
5
    tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4223
81
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++76
)
4224
76
      if (tensor_arena->vt_tensors[i] && 
!tensor_arena->vt_alias_refs[i]52
)
4225
50
      {
4226
50
        ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4227
50
        if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4228
0
        {
4229
0
          ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4230
0
          while (CCV_IS_TENSOR_MULTIVIEW(mv))
4231
0
            mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
4232
0
          tensor = (ccv_nnc_tensor_t*)mv;
4233
0
        }
4234
50
        tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4235
50
      }
4236
5
  }
4237
2.20k
  int flag = 0;
4238
22.2k
  for (i = 0; !flag && i < tensor_arena->vt_tensor_size; 
i++20.0k
)
4239
20.0k
    if (tensor_arena->vt_tensors[i] && 
!tensor_arena->vt_alias_refs[i]17.6k
)
4240
15.6k
    {
4241
15.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4242
15.6k
      ccv_nnc_tensor_param_t params = symbol_info->info;
4243
15.6k
      params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4244
15.6k
      params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4245
15.6k
      flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4246
15.6k
    }
4247
2.20k
  if (flag)
4248
0
    return -1;
4249
22.2k
  
for (i = 0; 2.20k
i < tensor_arena->vt_tensor_size;
i++20.0k
)
4250
20.0k
    if (tensor_arena->vt_tensors[i])
4251
17.6k
    {
4252
17.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4253
17.6k
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4254
17.6k
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4255
0
      {
4256
0
        assert(!tensor_arena->vt_alias_refs[i]);
4257
0
        _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4258
17.6k
      } else if (!tensor_arena->vt_alias_refs[i]) {
4259
15.6k
        ccv_nnc_tensor_param_t params = symbol_info->info;
4260
15.6k
        params.datatype = tensor->info.datatype;
4261
15.6k
        params.reserved = tensor->info.reserved;
4262
15.6k
        tensor->info = params;
4263
15.6k
      } else {
4264
2.00k
        off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4265
2.00k
        ccv_nnc_tensor_param_t params = symbol_info->info;
4266
2.00k
        params.datatype = tensor->info.datatype;
4267
2.00k
        params.reserved = tensor->info.reserved;
4268
2.00k
        tensor->info = params;
4269
2.00k
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4270
2.00k
        ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4271
2.00k
        if (CCV_IS_TENSOR_VIEW(tensor))
4272
0
        {
4273
0
          ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4274
0
          memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4275
0
        }
4276
2.00k
      }
4277
17.6k
    }
4278
  // We should handle sub_tensor_arena; we don't do that at the moment.
4279
2.20k
  assert(!graph->sub_graphs);
4280
2.20k
  return 0;
4281
2.20k
}
4282
4283
void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4284
2.20k
{
4285
2.20k
  assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4286
2.20k
  int i;
4287
11.0k
  for (i = 0; i < graph_exec_arena->graph_exec_size; 
i++8.82k
)
4288
8.82k
  {
4289
8.82k
    const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4290
8.82k
    if (graph_exec.d < 0)
4291
2.41k
      continue;
4292
6.41k
    const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4293
6.41k
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4294
6.41k
    ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4295
6.41k
    if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, reuse the backend and algorithm of the existing one, which hypothetically has been autotuned.
4296
6.41k
    {
4297
6.41k
      new_cmd.backend = existing_cmd.backend;
4298
6.41k
      new_cmd.algorithm = existing_cmd.algorithm;
4299
6.41k
    }
4300
6.41k
    ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4301
6.41k
  }
4302
2.20k
}
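When only tensor shapes change (not the graph topology), the two reinit entry points above let a compiled graph be reused without recompiling: ccv_nnc_tensor_arena_reinit returns -1 if any new shape would overflow the originally allocated space, and ccv_nnc_graph_exec_reinit refreshes the commands while keeping autotuned backends. A minimal sketch, assuming ccv_nnc_tensor_symbol_set from the symbolic graph API and reusing the illustrative names from the earlier sketches; note the assert above means only graphs without sub-graphs are handled:

/* Shrink one symbol's shape, then reinit the compiled objects in place. */
ccv_nnc_tensor_symbol_set(symbolic_graph, input_symbol, CPU_TENSOR_NHWC(32F, 8)); /* was 16 */
if (ccv_nnc_tensor_arena_reinit(tensor_arena, symbolic_graph) == 0)
	ccv_nnc_graph_exec_reinit(graph_exec_arena, graph, symbolic_graph);
else {
	/* -1: a new shape is larger than the originally allocated space; recompile instead. */
}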
4303
4304
void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4305
6.38k
{
4306
6.38k
  int i;
4307
22.7k
  for (i = 0; i < tensor_arena->buffer_size; 
i++16.3k
)
4308
16.3k
  {
4309
16.3k
    if (!tensor_arena->buffers[i].ptr)
4310
248
      continue;
4311
16.0k
    const int buffer_type = tensor_arena->buffers[i].type;
4312
16.0k
    const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4313
16.0k
#ifdef HAVE_CUDA
4314
16.0k
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4315
16.0k
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
4316
2.35k
    {
4317
2.35k
      if (tensor_arena->allocator.isa && 
tensor_arena->allocator.isa->free266
)
4318
266
        tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4319
2.09k
      else
4320
2.09k
        cufree(device_id, tensor_arena->buffers[i].ptr);
4321
13.7k
    } else {
4322
13.7k
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4323
13.7k
      if (tensor_arena->buffers[i].pin_mem)
4324
17
        cuhostfree(tensor_arena->buffers[i].ptr);
4325
13.7k
      else
4326
13.7k
        ccfree(tensor_arena->buffers[i].ptr);
4327
13.7k
    }
4328
#elif defined(HAVE_MPS)
4329
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4330
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
4331
    {
4332
      // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4333
      //  tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4334
      // else
4335
      mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4336
    } else {
4337
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4338
      ccfree(tensor_arena->buffers[i].ptr);
4339
    }
4340
#else
4341
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4342
    ccfree(tensor_arena->buffers[i].ptr);
4343
#endif
4344
16.0k
    tensor_arena->buffers[i].ptr = 0;
4345
16.0k
  }
4346
  // For now, the life-cycle of the disposers lives with the buffers. It may end before the tensor arena deallocates.
4347
6.38k
  if (tensor_arena->disposers)
4348
0
  {
4349
0
    for (i = 0; i < tensor_arena->disposers->rnum; i++)
4350
0
    {
4351
0
      ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i);
4352
0
      disposer->dispose(disposer->ptr, disposer->userdata);
4353
0
    }
4354
0
    ccv_array_free(tensor_arena->disposers);
4355
0
    tensor_arena->disposers = 0;
4356
0
  }
4357
6.38k
}
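Because ccv_nnc_tensor_arena_buffer_free zeroes each buffers[i].ptr after releasing it (and the loop skips null pointers), it can be called early to return the heavy backing allocations while keeping the arena's metadata alive; the later full free is still safe. A minimal sketch, reusing `tensor_arena` from the earlier sketches:

/* Release backing buffers as soon as results have been copied out of the arena. */
ccv_nnc_tensor_arena_buffer_free(tensor_arena); /* buffers[i].ptr are zeroed here, so */
ccv_nnc_tensor_arena_free(tensor_arena);        /* the final free skips them safely.  */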
4358
4359
void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4360
6.17k
{
4361
6.17k
  ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4362
6.17k
  _ccv_nnc_tensor_arena_free(tensor_arena);
4363
6.17k
}
4364
4365
void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4366
6.22k
{
4367
6.22k
  int i;
4368
6.27k
  for (i = 0; i < graph_exec_arena->sub_arena_size; 
i++50
)
4369
50
    if (graph_exec_arena->sub_arenas[i])
4370
49
      ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4371
6.22k
  ccfree(graph_exec_arena);
4372
6.22k
}