Coverage Report

Created: 2026-04-14 20:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_compile.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#include "ccv_nnc_easy.h"
4
#include "ccv_internal.h"
5
#ifdef HAVE_CUDA
6
#include "gpu/ccv_nnc_compat.h"
7
#elif defined(HAVE_MPS)
8
#include "mps/ccv_nnc_mps.h"
9
#endif
10
#include "_ccv_nnc_graph.h"
11
#include "_ccv_nnc_symbolic_graph.h"
12
13
// MARK - Level-3 API
14
15
typedef struct {
16
  int flags;
17
  int type;
18
  int pin_mem; // This memory need to be pinned.
19
  int ref; // Reference to another tensor block. Start with 1.
20
  int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21
  int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22
  int companion_ref; // Reference to another block that they two share the same memory region. Start with 1. the current crude implementation requires the two mutually be companion. Because there are two, we took the one that companion_ref <= i as the primary and companion_ref > i is the secondary. For allocation algorithm, we use the primary throughout.
23
  int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24
  ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25
  uint64_t size; // The size of the tensor expected.
26
  int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27
  ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28
  ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29
  ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30
} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
32
4.34M
// Evaluates to non-zero when idx is smaller than the companion's 0-based index
// (companion_ref itself is stored 1-based), i.e. when the block at idx is the primary
// of its companion pair. For a block with companion_ref == 0 the subtraction yields -1,
// which the cast turns into the largest uint32_t value, so the comparison also holds
// for blocks that have no companion at all.
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
34
// Bit values stored in the flags field of a tensor block.
enum {
  // The two lowest bits encode the assignment state; a block with neither bit set is "ordinary".
  UNASSIGNED = 0x1,
  ALIAS = 0x2,
  // The next two bits encode the access mode.
  READ_ONLY = 0x4,
  WRITE_ONLY = 0x8,
  READ_WRITE = 0xc, // Same bits as READ_ONLY | WRITE_ONLY; doubles as the access-mode mask.
  ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
  UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
  UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
};

// Accessors for the flags above. `t` is any expression that yields a struct with an int
// `flags` member; it is parenthesized in every expansion so that expressions such as `*p`
// work, and it is evaluated more than once, so it must not have side effects.

// Assignment state (UNASSIGNED / ALIAS bits).
#define TENSOR_EXPECT_ORDINARY(t) (((t).flags & (UNASSIGNED | ALIAS)) == 0)
#define TENSOR_EXPECT_SET_ORDINARY(t) ((t).flags = ((t).flags & ~(UNASSIGNED | ALIAS)))
#define TENSOR_EXPECT_UNASSIGNED(t) (((t).flags & (UNASSIGNED | ALIAS)) == UNASSIGNED)
#define TENSOR_EXPECT_SET_UNASSIGNED(t) ((t).flags = (((t).flags & ~(UNASSIGNED | ALIAS)) | UNASSIGNED))
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) ((t).flags = ((t).flags & ~UNASSIGNED))
#define TENSOR_EXPECT_ALIAS(t) (((t).flags & (UNASSIGNED | ALIAS)) == ALIAS)
// Neither exactly ALIAS nor exactly UNASSIGNED.
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))

// Access mode (READ_ONLY / WRITE_ONLY bits). `rw` is parenthesized so that a conditional
// or other low-precedence expression can be passed without clobbering the other flags.
#define TENSOR_READ_WRITE(t) ((t).flags & READ_WRITE)
#define TENSOR_SET_READ_WRITE(t, rw) ((t).flags = (((t).flags & ~READ_WRITE) | (rw)))

// Single-bit markers.
#define TENSOR_SET_ANONYMOUS(t) ((t).flags = (((t).flags & ~ANONYMOUS) | ANONYMOUS))
#define TENSOR_IS_ANONYMOUS(t) ((t).flags & ANONYMOUS)
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_INPUT))
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) ((t).flags & UNFOLDABLE_AS_INPUT)
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_OUTPUT))
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) ((t).flags & UNFOLDABLE_AS_OUTPUT)

// Non-zero when the given tensor symbol flags carry either of the INIT_ZEROS / INIT_ONES bits.
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62
63
// Extra per-node information kept alongside the exec nodes.
typedef struct {
  int flags; // NOTE(review): presumably a combination of the CCV_NNC_GRAPH_EXEC_ATTR_* bits below - confirm against the users of exec_flags.
} ccv_nnc_graph_exec_flag_t;

enum {
  // An additional IO transfer has to be inserted for the case..of statement.
  CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 1 << 0,
};
71
72
typedef struct {
73
  int index;
74
  int oc;
75
  int type;
76
  uint64_t size;
77
} ccv_nnc_tensor_opt_t;
78
79
// We first sort the same type together (because they won't be reused at all.
80
// And then we sort by size, after that, sort by oc.
81
227k
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82
227k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
83
#undef more_than
84
typedef struct {
85
  int idx;
86
  int hop;
87
} ccv_nnc_tensor_hop_t;
88
225k
#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89
225k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
90
#undef less_than
91
92
// If b has items overlap with a, a is still after b (inclusive).
93
static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94
0
{
95
0
  assert(a);
96
0
  assert(b);
97
0
  int x, y;
98
0
  for (x = 0; x < b->rnum; x++)
99
0
  {
100
0
    const int p = *(int*)ccv_array_get(b, x);
101
0
    int flag = 0;
102
    // In extreme cases where a is a superset of b, then a is still after b, we are good.
103
0
    for (y = 0; !flag && y < a->rnum; y++)
104
0
    {
105
0
      const int q = *(int*)ccv_array_get(a, y);
106
0
      flag = (p == q);
107
0
    }
108
0
    if (!flag)
109
0
      for (y = 0; y < a->rnum; y++)
110
0
      {
111
0
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
112
0
        if (!cell.i32 || cell.i32[0] == 0)
113
0
          return 0;
114
0
      }
115
0
  }
116
  // If b->rnum == 0, a is after b for sure.
117
  // Otherwise, if a->rnum == 0, we don't check any, buf if b->rnum > 0, then we cannot say a is after b.
118
  // if both a->rnum > 0 and b->rnum > 0, above logic should checked all.
119
0
  return (a->rnum > 0 || b->rnum == 0);
120
0
}
121
122
static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123
411k
{
124
411k
  assert(a);
125
411k
  assert(b);
126
411k
  if (!a->rnum || 
!b->rnum411k
)
127
4
    return 0;
128
411k
  int x, y, max_hop = 0;
129
484k
  for (x = 0; x < a->rnum; 
x++72.8k
)
130
411k
  {
131
411k
    ccv_sparse_matrix_vector_t* const vector = ccv_get_sparse_matrix_vector(exec_dep, *(int*)ccv_array_get(a, x));
132
411k
    if (!vector)
133
124k
      return 0;
134
360k
    
for (y = 0; 286k
y < b->rnum;
y++73.3k
)
135
287k
    {
136
287k
      const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, *(int*)ccv_array_get(b, y));
137
287k
      if (!cell.i32 || 
cell.i32[0] == 073.3k
)
138
214k
        return 0;
139
73.3k
      if (cell.i32[0] > max_hop)
140
72.8k
        max_hop = cell.i32[0];
141
73.3k
    }
142
286k
  }
143
  // We've entered this nested-for loop, therefore, it must be verifiably, deterministically after b now.
144
  // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
145
72.8k
  return max_hop;
146
411k
}
147
148
// If every a's head is deterministically after b's tail
149
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
150
411k
{
151
411k
  return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
152
411k
}
153
154
typedef struct {
155
  ccv_array_t** alloc_dep;
156
  int vt_block_size;
157
  int buffer_size;
158
  int block_size;
159
  int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
160
  struct {
161
    int type; // The type from tensor blocks.
162
    int pin_mem; // Whether this is pinned memory.
163
    int flags; // The flags (currently for READ_ONLY or not).
164
    uint64_t size; // The size of the buffer allocated.
165
    int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
166
    ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
167
  }* buffers;
168
  struct {
169
    int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
170
    int block_ref; // A reference to which block in the given tensor_block to use.
171
    uint64_t offset; // The offset of this block.
172
  }* blocks;
173
} ccv_nnc_tensor_alloc_prep_t;
174
175
typedef struct ccv_nnc_symbolic_graph_prep_s {
176
  int flags;
177
  int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
178
  int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
179
  int exec_idx;
180
  int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
181
  int tensor_symbol_info_size;
182
  int exec_symbol_info_size;
183
  int tensor_block_size;
184
  int sub_prep_size;
185
  ccv_nnc_tensor_block_t* tensor_blocks;
186
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
187
  ccv_nnc_graph_exec_flag_t* exec_flags;
188
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
189
  int* dup_tensor_block_ref;
190
  ccv_nnc_graph_visit_t* visit;
191
  ccv_nnc_tensor_alloc_prep_t* alloc_prep;
192
  struct ccv_nnc_symbolic_graph_prep_s* p;
193
  struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
194
  // Structures that don't require to be freed after deallocation.
195
  const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
196
  ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
197
  ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
198
  ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
199
} ccv_nnc_symbolic_graph_prep_t;
200
201
typedef struct {
202
  int oc;
203
  ccv_array_t* itf;
204
} ccv_nnc_tensor_block_adjacent_t;
205
206
static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
207
6.26k
{
208
  // Compute how many dis-continuous buffers are needed.
209
  // We prefer to have several dis-continuous buffers instead of one big buffer because
210
  // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
211
  // to fully utilize memory.
212
6.26k
  int i, j, k;
213
6.26k
  ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
214
6.26k
  int allocable_tensor_size = 0, available_tensor_size = 0;
215
98.0k
  for (i = 0; i < tensor_block_size; 
i++91.7k
)
216
91.7k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
217
30.2k
    {
218
      // Tensors that we need the header info.
219
30.2k
      ++available_tensor_size;
220
30.2k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
221
        // Tensors that we actually need to allocate (exclude the alias).
222
27.5k
        ++allocable_tensor_size;
223
30.2k
    }
224
6.26k
  ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
225
6.26k
  ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
226
6.26k
  ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
227
  // Overlap count.
228
98.0k
  for (i = 0; i < tensor_block_size; 
i++91.7k
)
229
91.7k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
230
1.68M
      
for (j = i + 1; 27.5k
j < tensor_block_size;
j++1.66M
)
231
1.66M
        if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
232
637k
        {
233
          // We only reuse buffers within the same memory type. The tensor_dt / tensor_df
234
          // matrices are only queried later for same-type candidates in this function,
235
          // thus cross-type hop relations are not needed for allocation planning here.
236
637k
          if (tensor_blocks[i].type != tensor_blocks[j].type)
237
431k
            continue;
238
          // Check to see if they interfere (default to yes).
239
          // If any of the i's head is deterministically later than j's tail
240
          // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
241
205k
          const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
242
205k
          int j_hop_i = 0;
243
205k
          if (i_hop_j > 0)
244
175
          {
245
175
            ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
246
175
            ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
247
205k
          } else {
248
            // It cannot be that both directions are positive. If i can hop to j, we don't
249
            // need the reverse hop value for any subsequent allocation decision.
250
205k
            j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
251
205k
            if (j_hop_i > 0)
252
72.6k
            {
253
72.6k
              ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
254
72.6k
              ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
255
72.6k
            }
256
205k
          }
257
205k
          if (!i_hop_j && 
!j_hop_i205k
)
258
133k
          {
259
133k
            if (!adj[i].itf)
260
4.62k
              adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
261
133k
            ccv_array_push(adj[i].itf, &j);
262
133k
            ++adj[i].oc;
263
133k
            if (!adj[j].itf)
264
22.5k
              adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
265
133k
            ccv_array_push(adj[j].itf, &i);
266
133k
            ++adj[j].oc;
267
133k
          }
268
205k
        }
269
6.26k
  const int exec_dep_rows = exec_dep->rows;
270
6.26k
  ccv_matrix_free(exec_dep);
271
6.26k
  ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
272
6.26k
  int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
273
6.26k
  uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
274
6.26k
  uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
275
6.26k
  uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
276
6.26k
  int num_assigned = 0; 
277
  // I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
278
  // Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
279
  // The first channel denotes the bytes available for allocation,
280
  // the second channel denotes the offset available for the allocation,
281
6.26k
  ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
282
6.26k
  ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
283
33.7k
  for (j = 0; j < allocable_tensor_size;)
284
27.5k
  {
285
    // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
286
27.5k
    uint64_t max_size = 0;
287
27.5k
    ccv_array_clear(opt);
288
27.5k
    int current_type = 0; // Deal with one type at a time.
289
4.00M
    for (i = 0; i < tensor_block_size; 
i++3.97M
)
290
3.97M
      if (tensor_blocks[i].size >= max_size &&
291
3.97M
        
TENSOR_EXPECT_COMPUTABLE2.08M
(tensor_blocks[i]) &&
!assigned[i]939k
&&
292
3.97M
        
IS_PRIMARY_COMPANION364k
(i, tensor_blocks[i]) &&
293
3.97M
        
(364k
!current_type364k
||
tensor_blocks[i].type == current_type337k
))
294
122k
      {
295
122k
        ccv_nnc_tensor_opt_t a = {
296
122k
          .size = tensor_blocks[i].size,
297
122k
          .index = i,
298
122k
          .oc = adj[i].oc,
299
122k
          .type = tensor_blocks[i].type,
300
122k
        };
301
122k
        assert(a.type);
302
122k
        current_type = a.type; // Now we now the primary type we should deal with.
303
122k
        if (tensor_blocks[i].companion_ref)
304
36
        {
305
36
          const int companion_ref = tensor_blocks[i].companion_ref - 1;
306
36
          a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
307
36
          a.oc += adj[companion_ref].oc;
308
36
        }
309
        // In case we have a tie, take them all in the array.
310
122k
        if (a.size > max_size)
311
32.1k
          ccv_array_clear(opt), max_size = a.size;
312
122k
        ccv_array_push(opt, &a);
313
122k
      }
314
27.5k
    assert(opt->rnum > 0);
315
    // Order opt array by the oc because type and size should be equal at this point.
316
27.5k
    _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
317
    // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
318
27.5k
    int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
319
27.5k
    uint64_t min_val[2] = {
320
27.5k
      0, 0
321
27.5k
    };
322
27.5k
    if (j > 0)
323
22.6k
    {
324
69.8k
      for (i = 0; i < opt->rnum; 
i++47.1k
)
325
58.3k
      {
326
58.3k
        ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
327
58.3k
        if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
328
28.9k
          continue;
329
        // Now, determine the order between a and c. After this, we can always check whether y
330
        // can hop to the earliest one and if the latest one can hop to x.
331
        // The earliest one will be called p and the latest one will be called q.
332
29.4k
        int p = a.index;
333
29.4k
        int q = a.index;
334
29.4k
        if (tensor_blocks[a.index].companion_ref)
335
18
        {
336
18
          const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
337
18
          if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
338
0
            continue;
339
18
          const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
340
18
          if (b_hop_p.i32 && 
b_hop_p.i32[0] > 01
)
341
1
            p = companion_ref;
342
17
          else {
343
17
            const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
344
17
            if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
345
17
              q = companion_ref;
346
0
            else { // Otherwise, b is in between p and q.
347
0
              const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
348
0
              const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
349
0
              assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
350
0
            }
351
17
          }
352
18
        }
353
29.4k
        assert(tensor_blocks[q].type == tensor_blocks[p].type);
354
29.4k
        const int type = tensor_blocks[p].type;
355
        // y is always earlier than x, but this is hard to assert now.
356
        // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
357
        // Thus, the hop between y and x (through a) should be smallest ones.
358
        // We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
359
        // out of q. For these nodes, we try to verify whether they form a connection (by checking against
360
        // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
361
29.4k
        int y_size = 0;
362
29.4k
        ccv_nnc_tensor_hop_t* const y_buf = buf;
363
96.1k
#define for_block(y, val) do { \
364
96.1k
          if (((int*)val)[0] > 0 && assigned[y] && 
tensor_blocks[y].type == type35.7k
&&
tensor_blocks[y].size >= a.size35.7k
) \
365
96.1k
            y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
366
35.7k
              .idx = y + 1, .hop = ((int*)val)[0] \
367
35.7k
            }; \
368
96.1k
        } while(0)
369
29.4k
        ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
370
29.4k
        if (y_vector)
371
96.1k
          
CCV_SPARSE_VECTOR_FOREACH17.9k
(tensor_dt, y_vector, for_block);
372
29.4k
#undef for_block
373
29.4k
        assert(y_size <= tensor_block_size);
374
29.4k
        int x_size = 0;
375
29.4k
        ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
376
76.6k
#define for_block(x, val) do { \
377
76.6k
          if (((int*)val)[0] > 0 && assigned[x] && 
tensor_blocks[x].type == type30.8k
&&
tensor_blocks[x].size >= a.size30.8k
) \
378
76.6k
            x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
379
30.8k
              .idx = x + 1, .hop = ((int*)val)[0] \
380
30.8k
            }; \
381
76.6k
        } while(0)
382
29.4k
        ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
383
29.4k
        if (x_vector)
384
76.6k
          
CCV_SPARSE_VECTOR_FOREACH15.9k
(tensor_df, x_vector, for_block);
385
29.4k
#undef for_block
386
29.4k
        assert(y_size + x_size <= tensor_block_size);
387
29.4k
        int x, y;
388
29.4k
        if (y_size > 1)
389
4.33k
          _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
390
41.9k
        for (y = 0; y < y_size; 
y++12.5k
)
391
18.7k
        {
392
18.7k
          const int hop = exec_dep_rows + y_buf[y].hop;
393
18.7k
          if (hop >= min_hop)
394
0
            break;
395
18.7k
          const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
396
18.7k
          if (val.u64 && 
val.u64[0] >= a.size12.6k
)
397
6.26k
          {
398
6.26k
            min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
399
6.26k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
400
6.26k
            break;
401
6.26k
          }
402
18.7k
        }
403
29.4k
        if (x_size > 1)
404
1.50k
          _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
405
41.9k
        for (x = 0; x < x_size; 
x++12.5k
)
406
15.6k
        {
407
15.6k
          const int hop = exec_dep_rows + x_buf[x].hop;
408
15.6k
          if (hop >= min_hop)
409
263
            break;
410
15.3k
          const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
411
15.3k
          if (val.u64 && 
val.u64[0] >= a.size3.54k
)
412
2.83k
          {
413
2.83k
            min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
414
2.83k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
415
2.83k
            break;
416
2.83k
          }
417
15.3k
        }
418
29.4k
        if (x_size > 0)
419
10.1k
        {
420
10.1k
          const int x_min_hop = x_buf[0].hop;
421
15.7k
          for (y = 0; y < y_size; 
y++5.60k
)
422
6.01k
          {
423
6.01k
            const int y_hop_p_v = y_buf[y].hop;
424
6.01k
            if (y_hop_p_v + x_min_hop >= min_hop)
425
408
              break;
426
5.60k
            ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
427
5.60k
            if (y_vector)
428
5.60k
            {
429
52.1k
              for (x = 0; x < x_size; 
x++46.5k
)
430
49.4k
              {
431
49.4k
                const int q_hop_x_v = x_buf[x].hop;
432
49.4k
                const int hop = y_hop_p_v + q_hop_x_v;
433
49.4k
                if (hop >= min_hop)
434
421
                  break;
435
49.0k
                const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
436
49.0k
                if (val.u64 && 
val.u64[0] >= a.size2.60k
)
437
2.51k
                {
438
2.51k
                  min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
439
2.51k
                    min_val[0] = val.u64[0], min_val[1] = val.u64[1];
440
2.51k
                  break;
441
2.51k
                }
442
49.0k
              }
443
5.60k
            }
444
5.60k
          }
445
10.1k
        }
446
        // If I found a place, stop, and exit.
447
29.4k
        if (min_y > 0 || 
min_x < tensor_block_size + 120.8k
)
448
11.1k
        {
449
11.1k
          min_i = i;
450
11.1k
          break;
451
11.1k
        }
452
        // There is no space to insert this block, mark it as such.
453
18.2k
        tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
454
18.2k
        if (tensor_blocks[a.index].companion_ref)
455
18
        {
456
18
          const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
457
18
          tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
458
18
        }
459
18.2k
      }
460
22.6k
    }
461
    // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
462
    // and default to largest size available.
463
27.5k
    ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
464
27.5k
    if (min_i == -1)
465
16.3k
    {
466
16.3k
      allocated_size[num_assigned] = a.size;
467
16.3k
      ++num_assigned;
468
16.3k
    }
469
27.5k
    int assign_group = num_assigned;
470
27.5k
    if (min_y > 0)
471
8.57k
    {
472
8.57k
      assign_group = assigned[min_y - 1];
473
      // The y and x should belong to the same assigned group.
474
8.57k
      assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
475
18.9k
    } else if (min_x < tensor_block_size + 1)
476
2.58k
      assign_group = assigned[min_x - 1];
477
    // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
478
27.5k
    if (min_y != 0 || 
min_x != tensor_block_size + 118.9k
)
479
11.1k
    {
480
11.1k
      uint64_t val[2] = {
481
11.1k
        min_val[0], min_val[1]
482
11.1k
      };
483
11.1k
      assert(val[0] >= a.size);
484
11.1k
      val[0] -= a.size;
485
11.1k
      val[1] = val[1] + a.size; // Move the offset to the next one.
486
11.1k
      ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
487
11.1k
    }
488
27.5k
    int strings[3];
489
27.5k
    strings[0] = a.index + 1;
490
27.5k
    int string_size = 1;
491
    // Assign out designated companion if it exist.
492
27.5k
    if (tensor_blocks[a.index].companion_ref)
493
20
    {
494
20
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495
20
      assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
496
20
      const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497
20
      if (b_hop_p.i32 && 
b_hop_p.i32[0] > 02
)
498
2
      {
499
4
        for (i = 0; i < string_size; 
i++2
)
500
2
          strings[i + 1] = strings[i];
501
2
        strings[0] = companion_ref + 1;
502
18
      } else {
503
18
        const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504
18
        if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505
18
          strings[string_size] = companion_ref + 1;
506
0
        else {
507
          // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must in between p and q. Therefore, I must have 2 allocations.
508
0
          assert(string_size == 2);
509
0
          strings[2] = strings[1];
510
0
          strings[1] = companion_ref + 1;
511
0
        }
512
18
      }
513
20
      ++string_size;
514
20
    }
515
    // Assign out and update oc.
516
55.0k
    
for (i = 0; 27.5k
i < string_size;
i++27.5k
)
517
27.5k
    {
518
27.5k
      const int index = strings[i] - 1;
519
      // Assign out the selected one.
520
27.5k
      assigned[index] = assign_group;
521
      // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522
27.5k
      allocated_offset[index] = min_val[1];
523
27.5k
      if (adj[index].itf)
524
293k
        
for (k = 0; 27.2k
k < adj[index].itf->rnum;
k++266k
)
525
266k
        {
526
266k
          const int d = *(int*)ccv_array_get(adj[index].itf, k);
527
266k
          if (!assigned[d] && 
TENSOR_EXPECT_COMPUTABLE133k
(tensor_blocks[d]))
528
133k
            --adj[d].oc;
529
266k
        }
530
27.5k
    }
531
27.5k
    uint64_t val[2] = {
532
27.5k
      a.size, min_val[1]
533
27.5k
    };
534
27.5k
    uint64_t consumed_size = 0;
535
    // Go over from min_y to string_size (excluding min_x).
536
27.5k
    for (i = 0; i < string_size; 
i++0
)
537
27.5k
    {
538
27.5k
      const uint64_t size = tensor_blocks[strings[i] - 1].size;
539
27.5k
      assert(size <= a.size);
540
      // Update consumed size if it is bigger than "size".
541
27.5k
      if (size > consumed_size)
542
27.5k
      {
543
27.5k
        val[0] = size - consumed_size;
544
27.5k
        ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545
27.5k
        consumed_size = size;
546
27.5k
        val[1] = min_val[1] + consumed_size;
547
27.5k
      }
548
      // If it consumed all the flow, break out.
549
27.5k
      if (consumed_size == a.size)
550
27.5k
        break;
551
27.5k
    }
552
55.0k
    
for (i = 0; 27.5k
i < string_size;
i++27.5k
)
553
27.5k
    {
554
27.5k
      const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555
27.5k
      uint64_t val[2] = {
556
27.5k
        i_size, min_val[1]
557
27.5k
      };
558
27.5k
      uint64_t consumed_size = 0;
559
27.5k
      for (k = i + 1; k < string_size; 
k++0
)
560
20
      {
561
20
        const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
562
        // Update consumed size if it is bigger than "size".
563
20
        if (size > consumed_size)
564
20
        {
565
20
          val[0] = size - consumed_size;
566
20
          ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567
20
          consumed_size = size;
568
20
          val[1] = min_val[1] + consumed_size;
569
20
        }
570
        // If it consumed all the flow, break out.
571
20
        if (consumed_size == i_size)
572
20
          break;
573
20
      }
574
27.5k
      val[0] = i_size - consumed_size;
575
      // Still have residual, flow it to min_x.
576
27.5k
      if (val[0] > 0)
577
27.5k
        ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578
27.5k
    }
579
27.5k
    if (min_i == -1)
580
16.3k
    {
581
      // If we decide to insert a new edge, simply marking anyone who is not interfere with it to redo.
582
16.3k
      const int p = strings[0] - 1;
583
16.3k
      const int q = strings[string_size - 1] - 1;
584
16.3k
      const int type = tensor_blocks[p].type;
585
16.3k
#define for_block(y, val) 
do 9.01k
{ \
586
9.01k
        if (((int*)val)[0] > 0 && !assigned[y] && 
tensor_blocks[y].type == type5.00k
&&
tensor_blocks[y].size <= a.size5.00k
) \
587
9.01k
        { \
588
5.00k
          tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
589
5.00k
          if (tensor_blocks[y].companion_ref) \
590
5.00k
          { \
591
3
            const int companion_ref = tensor_blocks[y].companion_ref - 1; \
592
3
            tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
593
3
          } \
594
5.00k
        } \
595
9.01k
      } while(0)
596
16.3k
      ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
597
16.3k
      if (y_vector)
598
9.01k
        
CCV_SPARSE_VECTOR_FOREACH4.57k
(tensor_dt, y_vector, for_block);
599
16.3k
#undef for_block
600
29.9k
#define for_block(x, val) do { \
601
29.9k
        if (((int*)val)[0] > 0 && !assigned[x] && 
tensor_blocks[x].type == type14.2k
&&
tensor_blocks[x].size <= a.size14.2k
) \
602
29.9k
        { \
603
14.2k
          tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
604
14.2k
          if (tensor_blocks[x].companion_ref) \
605
14.2k
          { \
606
2
            const int companion_ref = tensor_blocks[x].companion_ref - 1; \
607
2
            tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
608
2
          } \
609
14.2k
        } \
610
29.9k
      } while(0)
611
16.3k
      ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
612
16.3k
      if (x_vector)
613
29.9k
        
CCV_SPARSE_VECTOR_FOREACH9.48k
(tensor_df, x_vector, for_block);
614
16.3k
#undef for_block
615
16.3k
    }
616
27.5k
    j += string_size;
617
27.5k
  }
618
6.26k
  ccfree(tensor_block_cannot_insert);
619
6.26k
  ccfree(buf);
620
6.26k
  ccv_array_free(opt);
621
6.26k
  ccv_matrix_free(tensor_df);
622
6.26k
  ccv_matrix_free(tensor_dt);
623
55.0k
#define for_block(y, x, val) do { \
624
55.0k
    if (((uint64_t*)val)[0] > 0 && 
y > 044.8k
&&
x < tensor_block_size + 128.1k
) \
625
55.0k
    { \
626
11.4k
      if (!alloc_dep[x - 1]) \
627
11.4k
        
alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0)10.9k
; \
628
11.4k
      ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
629
11.4k
    } \
630
55.0k
  } while (0)
631
55.0k
  
CCV_SPARSE_FOREACH6.26k
(alloc, for_block);
632
6.26k
#undef for_block
633
6.26k
  ccv_matrix_free(alloc);
634
98.0k
  for (i = 0; i < tensor_block_size; 
i++91.7k
)
635
91.7k
    if (adj[i].itf)
636
27.2k
      ccv_array_free(adj[i].itf);
637
6.26k
  ccfree(adj);
638
6.26k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
639
6.26k
  alloc_prep->alloc_dep = alloc_dep;
640
6.26k
  alloc_prep->vt_block_size = tensor_block_size;
641
6.26k
  alloc_prep->buffer_size = num_assigned;
642
6.26k
  alloc_prep->block_size = available_tensor_size;
643
6.26k
  alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
644
6.26k
  alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
645
6.26k
  alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
646
6.26k
  memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
647
22.6k
  for (i = 0; i < num_assigned; 
i++16.3k
)
648
16.3k
    alloc_prep->buffers[i].size = allocated_size[i];
649
6.26k
  if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
650
0
  {
651
0
    size_t total_size = 0;
652
0
    for (i = 0; i < num_assigned; i++)
653
0
      total_size += allocated_size[i];
654
0
    PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size);
655
0
  }
656
6.26k
  ccfree(allocated_size);
657
6.26k
  j = 0;
658
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
659
98.0k
  for (i = 0; i < tensor_block_size; 
i++91.7k
)
660
91.7k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
661
30.2k
    {
662
30.2k
      alloc_prep->blocks[j].block_ref = i;
663
30.2k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
664
27.5k
      {
665
27.5k
        alloc_prep->vt_blocks[i] = j;
666
        // Also, set its allocations.
667
27.5k
        assert(assigned[i] > 0);
668
27.5k
        const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
669
27.5k
        alloc_prep->blocks[j].offset = allocated_offset[i];
670
27.5k
        if (!alloc_prep->buffers[buffer_ref].type)
671
16.3k
          alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
672
27.5k
        alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || 
tensor_blocks[i].pin_mem27.5k
;
673
27.5k
        alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
674
27.5k
        assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
675
27.5k
      } else {
676
2.71k
        alloc_prep->vt_blocks[i] = -1;
677
2.71k
        alloc_prep->blocks[j].buffer_ref = -1;
678
2.71k
        alloc_prep->blocks[j].offset = 0;
679
2.71k
      }
680
30.2k
      ++j;
681
30.2k
    } else
682
61.5k
      alloc_prep->vt_blocks[i] = -1;
683
6.26k
  ccfree(allocated_offset);
684
6.26k
  ccfree(assigned);
685
6.26k
  return alloc_prep;
686
6.26k
}
687
688
static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
689
6.26k
{
690
6.26k
  int i;
691
98.0k
  for (i = 0; i < alloc_prep->vt_block_size; 
i++91.7k
)
692
91.7k
    if (alloc_prep->alloc_dep[i])
693
10.9k
      ccv_array_free(alloc_prep->alloc_dep[i]);
694
22.6k
  for (i = 0; i < alloc_prep->buffer_size; 
i++16.3k
)
695
16.3k
    if (alloc_prep->buffers[i].dup_p_refs)
696
13
      ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
697
6.26k
  ccfree(alloc_prep->alloc_dep);
698
6.26k
  ccfree(alloc_prep);
699
6.26k
}
700
701
// Simple allocator from ccv_array_t.
702
static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
703
76.9k
{
704
76.9k
  int pos = tensor_metadata->rnum;
705
76.9k
  int rsize = (size + 15) / 16;
706
76.9k
  ccv_array_resize(tensor_metadata, pos + rsize);
707
76.9k
  return (pos << 1) + 1;
708
76.9k
}
709
710
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
711
163k
{
712
163k
  assert((pos >> 1) < tensor_metadata->rnum);
713
163k
  return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
714
163k
}
715
716
83.9k
#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(
ptr590
) & 1)
717
718
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
719
83.4k
{
720
  // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
721
83.4k
  if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
722
0
    return vt_tensor;
723
83.4k
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
724
83.4k
  if (tensor->alias_ref && 
CCV_NNC_IS_METADATA_POS100
(tensor->alias_ref))
725
80
  {
726
80
    const int alias_ref = tensor->alias_ref;
727
80
    tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
728
80
    _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
729
80
  }
730
83.4k
  if (CCV_IS_TENSOR_MULTIVIEW(tensor))
731
84
  {
732
84
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
733
84
    int i;
734
84
    const int count = mv->kind + mv->repeat;
735
267
    for (i = 0; i < count; 
i++183
)
736
183
    {
737
183
      if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
738
147
      {
739
147
        const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
740
147
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
741
147
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
742
147
      }
743
183
    }
744
    // No need to recursively do parent pointer, otherwise we are in deep rewire.
745
84
    if (mv->p && 
CCV_NNC_IS_METADATA_POS11
(mv->p))
746
0
      mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
747
84
    if (mv->sp)
748
65
      
for (i = 0; 28
i < mv->sp->rnum;
i++37
)
749
37
      {
750
37
        ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
751
37
        if (CCV_NNC_IS_METADATA_POS(*tensor))
752
30
        {
753
30
          const int pos = (int)(intptr_t)*tensor;
754
30
          *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
755
30
          assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
756
30
          _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
757
30
        }
758
37
      }
759
84
  }
760
83.4k
  return tensor;
761
83.4k
}
762
763
typedef struct {
764
  const uint8_t* ptr;
765
  int pos;
766
} ccv_nnc_tensor_block_pos_t;
767
768
static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
769
114
{
770
114
  int i;
771
114
  int unref_block_ref = block_ref;
772
120
  while (prep->tensor_blocks[unref_block_ref].ref)
773
6
    unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
774
114
  int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
775
114
  assert(vt_ref >= 0);
776
114
  assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
777
114
  const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
778
114
  uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
779
114
  int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
780
114
  for (i = idx - 1; i >= 0; 
i--0
)
781
114
  {
782
114
    assert(p_ref >= 0);
783
114
    const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
784
114
    const int unroll_count = graph_prep->unroll_count;
785
114
    if (ch[i]) // Prefer the dup side of things.
786
12
      p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
787
114
    int unref_p_ref = p_ref;
788
114
    while (graph_prep->tensor_blocks[unref_p_ref].ref)
789
0
      unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
790
114
    vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
791
114
    const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
792
114
    offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
793
    // If the buffer already exists, prefer that.
794
114
    const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
795
114
    if (ptr)
796
114
    {
797
      // If I have any remaining path that is not covered from 0, I cannot possibly
798
      // have any pointer from buffer (that can only happen if it is not dup).
799
138
      for (--i; i >= 0; 
i--24
)
800
24
        if (ch[i] != 0)
801
0
          return 0;
802
      // Try to find the created tensor block pos in the array, just linear scan.
803
114
      const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
804
114
      ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
805
114
      *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
806
114
      ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
807
114
      return tv_pos;
808
114
    }
809
0
    p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
810
0
  }
811
0
  return 0;
812
114
}
813
814
// Descend from root to the prep level, and compose multiview from there.
815
static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
816
114
{
817
114
  assert(pos_ref);
818
114
  int i;
819
114
  const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
820
114
  const int unroll_count = prep->unroll_count;
821
114
  if (prep == graph_prep)
822
57
  {
823
57
    const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
824
57
    if (!data_pos)
825
0
      return -1;
826
    // Based on ch, go all the way back to find the exact pointer to compose.
827
57
    if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
828
57
      prep->dup_tensor_block_ref &&
829
57
      
prep->dup_tensor_block_ref[block_ref * unroll_count] >= 041
&&
830
57
      
prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref41
)
831
41
    {
832
41
      int pos[unroll_count + 1];
833
41
      pos[0] = data_pos;
834
98
      for (i = 0; i < unroll_count; 
i++57
)
835
57
        pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
836
41
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
837
41
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
838
41
      ccv_nnc_tensor_t* data[unroll_count + 1];
839
139
      for (i = 0; i < unroll_count + 1; 
i++98
)
840
98
        data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
841
41
      ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
842
139
      for (i = 0; i < unroll_count + 1; 
i++98
)
843
98
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
844
41
      *pos_ref = mv_pos;
845
41
    } else {
846
16
      *pos_ref = data_pos;
847
16
    }
848
57
    if (preserve)
849
5
    {
850
      // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv.
851
      // At any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
852
      // mv of K11, thus, when loop is 0, it unwraps to mv->data[0], otherwise, it unwraps to mv->data[1].
853
      // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
854
      // arena allocated).
855
      // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
856
      // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
857
      // it to a K01 structure.
858
      // Why didn't we wrap it directly as mv->data[0] pointing to an assigned tv pointer and mv->data[1] pointing
859
      // to the raw pointer (as ptr_ref) with K11? The reason is we don't know the assigned tv is pointing to one
860
      // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
861
5
      int prev_mv_pos = *pos_ref;
862
5
      if (prev_mv_pos == -1)
863
0
      {
864
0
        prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865
0
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
866
0
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
867
0
        ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
868
0
          tv,
869
0
        }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
870
0
        CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
871
0
      }
872
5
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
873
5
      ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
874
5
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
875
5
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
876
5
        CCV_NNC_TENSOR_PLACEHOLDER,
877
5
        (ccv_nnc_tensor_t*)prev_mv,
878
5
      }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
879
5
      prev_mv->p = (void*)(intptr_t)mv_pos;
880
5
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
881
5
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
882
5
      *pos_ref = mv_pos;
883
5
    }
884
57
    return 0;
885
57
  }
886
57
  ch[idx] = 0;
887
57
  int pos[unroll_count + 1];
888
57
  pos[0] = 0;
889
57
  const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
890
57
  assert(retval == 0);
891
67
  
for (i = 0; 57
i < unroll_count;
i++10
)
892
10
  {
893
10
    ch[idx] = i + 1;
894
10
    pos[i + 1] = 0;
895
10
    const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
896
10
    if (dup_retval < 0)
897
0
    {
898
0
      assert(i == 0);
899
0
      break;
900
0
    }
901
10
  }
902
  // If current prep has no dup.
903
57
  if (i == 0)
904
47
  {
905
47
    *pos_ref = pos[0];
906
47
    return 0;
907
47
  }
908
10
  ccv_nnc_tensor_t* data[unroll_count + 1];
909
  // Compose to a new multiview.
910
30
  for (i = 0; i < unroll_count + 1; 
i++20
)
911
20
    { assert(pos[i] > 0); }
912
10
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
913
30
  for (i = 0; i < unroll_count + 1; 
i++20
)
914
20
    data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
915
10
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
916
10
  ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
917
30
  for (i = 0; i < unroll_count + 1; 
i++20
)
918
20
    if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
919
4
      ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
920
30
  for (i = 0; i < unroll_count + 1; 
i++20
)
921
20
    CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
922
10
  *pos_ref = mv_pos;
923
10
  return 0;
924
10
}
925
926
static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
927
312
{
928
312
  int i;
929
312
  int is_input = 0;
930
312
  assert(node);
931
766
  
for (i = 0; 312
i < node->input_size &&
!is_input529
;
i++454
)
932
454
    if (p_ref == node->inputs[i])
933
153
      is_input = 1;
934
312
  int is_output = 0;
935
725
  for (i = 0; i < node->output_size && 
!is_output465
;
i++413
)
936
413
    if (p_ref == node->outputs[i])
937
167
      is_output = 1;
938
  // Prefer it is an output if it is both the input and the output.
939
312
  if (is_output)
940
167
    return 1;
941
145
  if (is_input)
942
145
    return -1;
943
0
  return 0;
944
145
}
945
946
static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
947
61
{
948
  // No need to check whether to preserve if this is not a while loop.
949
61
  if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
950
8
    return 0;
951
61
  assert
(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)53
;
952
  // If it is unassigned, no need to preserve.
953
53
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
954
2
    return 0;
955
51
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
956
  // If p is not input, no need to preserve at all.
957
51
  if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
958
19
    return 0;
959
32
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
960
32
  assert(vt_ref >= 0);
961
32
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
962
32
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
963
  // If the buffer is a truly read-only one, no need to preserve.
964
32
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
965
6
    return 0;
966
  /* This needs detailed explanation, what does preserve mean?
967
   * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
968
   * also used outside of the while loop, we cannot reuse the memory region of x for
969
   * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
970
   * y uses the same memory region as x). The way to work around this is by using a different
971
   * memory region for y = x + 1, but for the first iteration, having x pointing to the
972
   * original. During the allocation process, the way to identify whether x should preserve
973
   * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
974
   * parent tensor is the same as the memory region it plans to use in the buffer, then we are
975
   * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
976
   * it is the input tensor whenever that is possible. A tensor block can point to two parent
977
   * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
978
   * tensor whenever that is possible. */
979
26
  if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
980
15
    return 0;
981
  // Otherwise, return 1 because we now need to preserve.
982
11
  return 1;
983
26
}
984
985
static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
986
58
{
987
58
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
988
  // If it is unassigned, no need to force broadcast.
989
58
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
990
0
    return 0;
991
  // Only tape vars need to force broadcast; otherwise we already share the same memory region.
992
58
  if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
993
54
    return 0;
994
4
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
995
  // If p is not output, no need to broadcast at all.
996
4
  if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
997
3
    return 0;
998
1
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
999
1
  assert(vt_ref >= 0);
1000
1
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
1001
1
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
1002
  // If the buffer is a truly read-only one, no need to broadcast.
1003
1
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
1004
0
    return 0;
1005
  // Otherwise, return 1 because we now need to force broadcast for this tape var.
1006
1
  return 1;
1007
1
}
1008
1009
static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
1010
25
{
1011
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1012
25
  int i;
1013
78
  for (i = 0; i < mv->kind + mv->repeat; 
i++53
)
1014
53
    if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
1015
8
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
1016
45
    else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1017
7
      _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
1018
25
}
1019
1020
static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1021
25
{
1022
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1023
25
  int i;
1024
25
  if (mv->sp)
1025
8
    
for (i = 0; 2
i < mv->sp->rnum;
i++6
)
1026
6
    {
1027
6
      ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
1028
6
      if (CCV_NNC_IS_METADATA_POS(*tensor))
1029
1
      {
1030
1
        const int pos = (int)(intptr_t)*tensor;
1031
1
        *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1032
1
        assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
1033
1
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1034
1
      }
1035
6
    }
1036
78
  
for (i = 0; 25
i < mv->kind + mv->repeat;
i++53
)
1037
53
  {
1038
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1039
8
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1040
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
1041
0
      CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
1042
53
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1043
7
      _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1044
53
  }
1045
25
}
1046
1047
static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1048
47
{
1049
  // Go to the root of the graph.
1050
47
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1051
47
  int i;
1052
104
  for (i = 1; prep->p; 
i++57
)
1053
57
    prep = prep->p;
1054
  // Root graph should have no dup tensor blocks.
1055
47
  assert(!prep->dup_tensor_block_ref);
1056
47
  const int c = i;
1057
47
  const ccv_nnc_symbolic_graph_prep_t* preps[c];
1058
47
  prep = graph_prep;
1059
47
  preps[c - 1] = prep;
1060
104
  for (i = 0; prep->p; 
i++57
)
1061
57
    preps[c - 2 - i] = prep = prep->p;
1062
47
  int ch[c]; // Use dynamic allocation for array. This is an array to record our selections when recursive from top to bottom.
1063
47
  memset(ch, 0, sizeof(int) * c);
1064
47
  int pos = 0;
1065
47
  _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1066
47
  assert(ch[c - 1] == 0); // This should never be modified.
1067
47
  assert(pos > 0);
1068
47
  return pos;
1069
47
}
1070
1071
static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1072
3
{
1073
3
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1074
3
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1075
3
  ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : 
tensor0
;
1076
3
  ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1077
3
    CCV_NNC_TENSOR_PLACEHOLDER,
1078
3
    tv,
1079
3
  }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1080
3
  CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
1081
3
  CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
1082
3
  return mv_pos;
1083
3
}
1084
1085
static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1086
30
{
1087
30
  ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1088
30
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr);
1089
30
  if (!is_multiview)
1090
18
    return pos;
1091
24
  
while (12
CCV_IS_TENSOR_MULTIVIEW(tensor_ptr))
1092
12
  {
1093
12
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1094
12
    tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1095
12
  }
1096
12
  const ccv_nnc_tensor_t tensor = *tensor_ptr;
1097
12
  const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1098
12
  ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1099
12
  *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1100
12
  new_tensor->dataof = tensor.dataof;
1101
12
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1102
12
  new_tensor->alias_ref = (uintptr_t)pos;
1103
12
  ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1104
12
  return new_pos;
1105
30
}
1106
1107
static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1108
2.69k
{
1109
2.69k
  const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1110
  // The tensor it references is not an alias.
1111
2.69k
  assert(vt_tensors[alias_ref]);
1112
2.69k
  const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1113
2.69k
  const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1114
2.69k
  assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1115
  // Will use that to determine whether to insert a reference or not.
1116
2.69k
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1117
2.71k
  while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1118
13
  {
1119
13
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1120
13
    alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1121
13
  }
1122
2.69k
  const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1123
  // If there is no ofs, and the stride is packed for dim, we take a shortcut and just init as a normal tensor.
1124
2.69k
  int pos;
1125
2.69k
  if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1126
2.69k
    
ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim)2.66k
)
1127
2.63k
  {
1128
2.63k
    pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1129
2.63k
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1130
2.63k
    *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1131
2.63k
    tensor->dataof = alias_tensor.dataof;
1132
2.63k
  } else {
1133
59
    pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1134
59
    ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1135
    // Otherwise initialize a tensor view
1136
59
    *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1137
59
    tensor_view->alias_ref = (uintptr_t)alias_pos;
1138
59
  }
1139
2.69k
  vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1140
2.69k
  if (is_multiview)
1141
13
  {
1142
13
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1143
13
    ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1144
13
  }
1145
2.69k
}
1146
1147
// Fill in vt_tensors[block_ref] for an alias block, resolving whatever it depends on first.
// Two situations are handled:
//  - The block is unassigned, has a block-level alias_ref and no entry yet: it shares the entry of
//    the block it refers to (that one is resolved recursively when it is still missing).
//  - Otherwise the tensor symbol itself must be an alias (asserted): make sure the symbol it aliases
//    has an entry, then let _ccv_nnc_assign_vt_tensor_aliases create the entry for this block.
static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
{
  const int shares_entry = TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref];
  if (shares_entry)
  {
    // alias_ref is 1-based, 0 means "none".
    const int shared_ref = tensor_blocks[block_ref].alias_ref - 1;
    // The referred block may itself be an alias that hasn't been handled yet.
    if (!vt_tensors[shared_ref])
      _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, shared_ref, vt_tensors);
    vt_tensors[block_ref] = vt_tensors[shared_ref];
  } else {
    assert(tensor_symbol_info[block_ref].alias_ref);
    const int origin_ref = tensor_symbol_info[block_ref].alias_ref - 1;
    // No entry for the aliased symbol yet: it must be a ref with alias_ref (through folding).
    // Resolve it recursively so every alias on the chain ends up assigned.
    if (!vt_tensors[origin_ref])
      _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, origin_ref, vt_tensors);
    _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
  }
}
1165
1166
// Turn a linear pointer into object storage (such as an MTLBuffer).
1167
#ifdef HAVE_MPS
1168
// Disposer callback for objects created with mpobjcreate: hands ptr to mpobjfree.
// userdata is part of the disposer callback signature but is not used here.
static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
{
  (void)userdata;
  mpobjfree(0, ptr);
}
1172
#endif
1173
1174
// Pairs an opaque object pointer with a size.
// NOTE(review): no use of this type is visible in the surrounding code; confirm where (or whether) it is used.
typedef struct {
  size_t size; // Size associated with obj (presumably in bytes; confirm against users).
  void* obj; // The tracked object.
} tensor_arena_obj_track_t;
1178
1179
// Key of the obj_ptr hash map: identifies one region inside a buffer.
// Two keys are the same region only if all three fields match.
typedef struct {
  void* ptr; // Base pointer of the buffer.
  off_t offset; // Offset of the region from ptr.
  size_t size; // Size of the region in bytes.
} obj_ptr_key_t;
1184
1185
static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1186
0
{
1187
0
  return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1188
0
}
1189
1190
static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1191
0
{
1192
0
  return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1193
0
}
1194
1195
// Instantiate the khash table type `obj_ptr`: keys are obj_ptr_key_t, values are void* (the object
// created for that region), hashed and compared with the two helpers above. The 4th argument (1)
// selects map mode in khash's KHASH_INIT, i.e. a value is stored alongside each key.
KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)
1196
1197
// Produce the data handle a tensor should use for the region `offset` bytes into the buffer `ptr`.
//  - Returns 0 when params.dim[0] == 0.
//  - With HAVE_MPS and GPU memory: returns an object created by mpobjcreate for (ptr, offset, size),
//    where size is the byte size of a tensor with these params. Objects are cached in obj_ptr_map,
//    so an identical (ptr, offset, size) triple yields the same object. Every newly created object
//    gets a disposer pushed onto tensor_arena->disposers (the array is created on first use).
//  - Otherwise: returns the plain address ptr + offset.
// total_size is accepted but not consulted here. obj_ptr_map is only touched in the MPS GPU path.
static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
{
  if (params.dim[0] == 0)
    return 0;
#ifdef HAVE_MPS
  if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
  {
    int ret;
    const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype) * ccv_nnc_tensor_count(params);
    const obj_ptr_key_t key = {
      .ptr = ptr,
      .offset = offset,
      .size = size,
    };
    const khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret);
    // ret == 0: the key was already present, reuse the object created earlier.
    if (ret == 0)
      return kh_val(obj_ptr_map, k);
    void* const obj = mpobjcreate(ptr, offset, size);
    if (!tensor_arena->disposers)
      tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
    ccv_nnc_arena_disposer_t disposer = {
      .ptr = obj,
      .userdata = 0,
      .dispose = _ccv_nnc_tensor_arena_obj_dispose
    };
    ccv_array_push(tensor_arena->disposers, &disposer);
    // Only cache when the insertion succeeded. khash signals failure with a negative ret, in
    // which case k does not refer to a valid bucket and must not be written through. The object
    // is still returned (and disposed through the arena), it just isn't shared.
    if (ret > 0)
      kh_val(obj_ptr_map, k) = obj;
    return obj;
  }
#endif
  // Byte-wise offset. Go through uint8_t* because arithmetic on void* is a compiler extension.
  return (uint8_t*)ptr + offset;
}
1231
1232
static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1233
6.26k
{
1234
  // All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1235
  // Each tensor have the designation in assigned array, and offset in allocated_offset.
1236
6.26k
  const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1237
6.26k
  ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1238
6.26k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1239
6.26k
  const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1240
6.26k
  const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1241
6.26k
  const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? 
p_graph_prep->alloc_prep49
:
06.21k
;
1242
6.26k
  const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1243
6.26k
  const int unroll_count = graph_prep->unroll_count;
1244
6.26k
  int i, j;
1245
97.8k
  for (i = 0; i < tensor_symbol_info_size; 
i++91.6k
)
1246
91.6k
    
for (j = 0; 91.6k
TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) &&
j < unroll_count61.5k
;
j++7
)
1247
7
    {
1248
7
      const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1249
7
      if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref]))
1250
3
        TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i]);
1251
7
    }
1252
6.26k
  ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1253
6.26k
  graph_prep->tensor_arena = tensor_arena;
1254
6.26k
  tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1255
6.26k
  tensor_arena->buffers = (void*)(tensor_arena + 1);
1256
6.26k
  tensor_arena->buffer_size = alloc_prep->buffer_size;
1257
6.26k
  tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1258
6.26k
  tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1259
6.26k
  tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1260
6.26k
  tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1261
6.26k
  tensor_arena->pb_vt_tensors = 0;
1262
6.26k
  tensor_arena->vt_alias_r_refs_p = 0;
1263
6.26k
  tensor_arena->vt_alias_r_refs = 0;
1264
6.26k
  tensor_arena->vt_sizes = 0;
1265
6.26k
  tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1266
6.26k
  tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, (sizeof(ccv_nnc_tensor_t) * tensor_symbol_info_size + 15) / 16, 0);
1267
6.26k
  tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1268
6.26k
  tensor_arena->allocator.context.free = allocator.context.free;
1269
6.26k
  tensor_arena->allocator.isa = allocator.isa;
1270
6.26k
  tensor_arena->disposers = 0;
1271
  // Copy alias_ref info back to the tensor arena.
1272
97.8k
  for (i = 0; i < tensor_symbol_info_size; 
i++91.6k
)
1273
91.6k
    tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1274
  // Do the buffer copies.
1275
22.6k
  for (i = 0; i < alloc_prep->buffer_size; 
i++16.3k
)
1276
16.3k
    tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1277
16.3k
      tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1278
16.3k
      tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1279
6.26k
  if (graph_prep->while_count_tensor)
1280
19
  {
1281
    // If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1282
19
    int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1283
19
    assert((0 << 1) + 1 == pos); // pos must be 0 position.
1284
19
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1285
19
    *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1286
19
  }
1287
6.26k
  assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
1288
6.26k
  if (p_arena && 
p_graph_prep49
)
1289
49
  {
1290
    // Don't need to allocate the actual buffer, just use the pointer from the above.
1291
49
    PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
1292
229
    for (i = 0; i < tensor_arena->buffer_size; 
i++180
)
1293
180
    {
1294
180
      const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1295
180
      int unref_p_ref = p_ref;
1296
182
      while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1297
2
        unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1298
180
      assert(unref_p_ref >= 0);
1299
180
      const int p_unroll_count = p_graph_prep->unroll_count;
1300
180
      if (p_graph_prep->dup_tensor_block_ref &&
1301
180
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 016
&&
1302
180
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref16
)
1303
10
      {
1304
        // This condition means in the parent graph, we point to multiple tensor blocks for the same
1305
        // buffer, therefore, we cannot have one single pointer assigned in this case.
1306
        // Later we will handle this by generate ccv_tensor_multiview_t structure.
1307
10
        tensor_arena->buffers[i].ptr = 0;
1308
10
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1309
10
        continue;
1310
10
      }
1311
      // Otherwise, find the actual buffer pointer.
1312
170
      const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1313
170
      assert(vt_ref >= 0);
1314
170
      const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1315
170
      if (!p_arena->buffers[buffer_ref].ptr)
1316
0
      {
1317
        // Pass it down as 0 ptr.
1318
0
        tensor_arena->buffers[i].ptr = 0;
1319
0
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1320
0
        continue;
1321
0
      }
1322
170
      const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1323
170
      tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1324
170
      PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
1325
170
    }
1326
6.21k
  } else {
1327
    // Now, allocate actual buffers.
1328
6.21k
    PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
1329
22.3k
    for (i = 0; i < tensor_arena->buffer_size; 
i++16.1k
)
1330
16.1k
    {
1331
16.1k
      const int buffer_type = tensor_arena->buffers[i].type;
1332
16.1k
      const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
1333
16.1k
#ifdef HAVE_CUDA
1334
16.1k
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1335
2.37k
      {
1336
2.37k
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1337
2.37k
        if (allocator.isa && 
allocator.isa->alloc266
)
1338
266
          tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1339
2.10k
        else
1340
2.10k
          tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1341
2.37k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1342
13.7k
      } else {
1343
13.7k
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1344
13.7k
        if (tensor_arena->buffers[i].pin_mem)
1345
20
          tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1346
13.7k
        else
1347
13.7k
          ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1348
13.7k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1349
13.7k
      }
1350
#elif defined(HAVE_MPS)
1351
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1352
      {
1353
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1354
        // if (allocator.isa && allocator.isa->alloc)
1355
        //  tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1356
        // else
1357
        tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1358
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1359
      } else {
1360
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1361
        ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1362
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1363
      }
1364
#else
1365
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1366
      ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1367
      PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1368
#endif
1369
16.1k
      assert(tensor_arena->buffers[i].ptr);
1370
16.1k
    }
1371
6.21k
  }
1372
  // Go over sub_preps and allocate arenas for them. Do it this early because
1373
  // we may reference tensors from sub arenas, the reason why we need to reference
1374
  // tensors from sub arenas is because for output tensors, sub arena's tensor
1375
  // will have automatic reference updates.
1376
6.31k
  
for (i = 0; 6.26k
i < tensor_arena->sub_arena_size;
i++50
)
1377
50
    if (graph_prep->sub_preps[i])
1378
49
      tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1379
1
    else
1380
1
      tensor_arena->sub_arenas[i] = 0;
1381
6.26k
  memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1382
  // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1383
6.26k
  ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? 
(ccv_nnc_tensor_t**)29
cccalloc29
(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) :
06.23k
;
1384
#ifdef HAVE_MPS
1385
  khash_t(obj_ptr)* obj_ptr_map = kh_init(obj_ptr);
1386
#else
1387
6.26k
  khash_t(obj_ptr)* obj_ptr_map = 0;
1388
6.26k
#endif
1389
6.31k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1390
50
    if (tensor_arena->sub_arenas[i])
1391
49
    {
1392
49
      assert(graph_prep->sub_preps[i]);
1393
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1394
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1395
49
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1396
45
        
for (j = 0; 21
j < node->output_size;
j++24
)
1397
24
        {
1398
24
          const int idx = node->outputs[j];
1399
24
          const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1400
24
          assert(s_idx >= 0);
1401
24
          ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1402
24
          assert(sub_arena_out_tensors[idx] == 0);
1403
24
          ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1404
          // Only assign if it is a multiview tensor.
1405
24
          if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) ||
1406
24
            
(8
sub_alias8
&&
CCV_IS_TENSOR_MULTIVIEW1
(sub_alias)))
1407
17
            sub_arena_out_tensors[idx] = sub_tensor;
1408
24
        }
1409
49
    }
1410
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
1411
97.8k
  
for (i = 0; 6.26k
i < tensor_symbol_info_size;
i++91.6k
)
1412
91.6k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1413
27.4k
    {
1414
27.4k
      const int vt_ref = alloc_prep->vt_blocks[i];
1415
27.4k
      const int buffer_ref = vt_ref >= 0 ? 
alloc_prep->blocks[vt_ref].buffer_ref27.4k
:
-13
;
1416
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1417
      // previous layer, therefore, cannot really find the buffer ptr.
1418
27.4k
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[i]101
) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1419
27.4k
        
(27.4k
(27.4k
graph_prep->dup_tensor_block_ref27.4k
&&
1420
27.4k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] >= 059
&&
1421
27.4k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] != i57
) ||
1422
27.4k
         
(27.3k
buffer_ref >= 027.3k
&&
!tensor_arena->buffers[buffer_ref].ptr27.3k
)))
1423
47
      {
1424
47
        assert(graph_prep->p); // This must be in a sub-graph.
1425
        // If this is an input tensor, and it need to be preserved, wait until when we go through inputs to preserve.
1426
47
        if (graph_prep->tensor_blocks[i].p_refs[0] && 
_ccv_nnc_tensor_block_check_preserve(graph_prep, i)36
)
1427
4
          continue;
1428
43
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1429
43
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1430
43
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1431
27.3k
      } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])) {
1432
        // When we want to allocate, we don't really need to if it need force broadcast, because we will handle that later.
1433
27.3k
        const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1434
        // If already created, use the same tensor, and continue.
1435
        // Having ptr.
1436
27.3k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1437
27.3k
        ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1438
        // Also, set its allocations.
1439
        // Since tensor view is bit compatible with tensor, we can just cast.
1440
27.3k
        void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1441
27.3k
        *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1442
27.3k
        assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size);
1443
        // If we need to force broadcast, we need to wrap it in a multiview.
1444
27.3k
        if (graph_prep->tensor_blocks[i].p_refs[0] &&
1445
27.3k
          
_ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i)58
)
1446
1
        {
1447
1
          const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1448
1
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1449
1
          ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1450
1
          ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1451
1
            tv,
1452
1
          }, 0, 1, graph_prep->graph, mv);
1453
1
          CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1454
1
          pos = mv_pos;
1455
1
          ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1456
1
        }
1457
27.3k
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1458
27.3k
      }
1459
27.4k
    }
1460
#ifdef HAVE_MPS
1461
  kh_destroy(obj_ptr, obj_ptr_map);
1462
#endif
1463
  // Handle binded tensors. First handle cases without aliases.
1464
53.9k
  
for (i = 0; 6.26k
i < tensor_bind_size;
i++47.6k
)
1465
47.6k
  {
1466
47.6k
    assert(tensor_binds[i].tensor);
1467
47.6k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1468
47.6k
    if (resolved_symbol.d >= 0)
1469
47.6k
    {
1470
47.6k
      int d = resolved_symbol.d;
1471
47.6k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1472
1.02k
        continue;
1473
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
1474
      // It has nothing to do with alias.
1475
46.8k
      
while (46.6k
TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1476
146
        d = tensor_blocks[d].ref - 1;
1477
      // For binded tensors, it shouldn't be assigned yet.
1478
      // If it is assigned, the pointer should match the ones from the binded tensor.
1479
      // This can only happen if an enforced in-place tensor is binded twice. If that
1480
      // happens, we need to make sure it is binded to the same location.
1481
46.6k
      assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1482
      // See above assertion.
1483
46.6k
      if (tensor_arena->vt_tensors[d])
1484
0
        continue;
1485
46.6k
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1486
0
      {
1487
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1488
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1489
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1490
0
        if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1491
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1492
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1493
        // It is OK to be just as a whole smaller or equal to the binded one.
1494
0
        assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1495
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1496
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1497
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1498
46.6k
      } else {
1499
46.6k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1500
46.6k
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1501
46.6k
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1502
46.6k
        tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1503
46.6k
        tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1504
46.6k
        tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1505
46.6k
        tv->dataof = tensor_binds[i].tensor->dataof;
1506
46.6k
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1507
46.6k
      }
1508
46.6k
    }
1509
47.6k
  }
1510
  // Handle binded tensors. We handle alias here so it can reference to binded tensors.
1511
53.9k
  
for (i = 0; 6.26k
i < tensor_bind_size;
i++47.6k
)
1512
47.6k
  {
1513
47.6k
    assert(tensor_binds[i].tensor);
1514
47.6k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1515
47.6k
    if (resolved_symbol.d >= 0)
1516
47.6k
    {
1517
47.6k
      int d = resolved_symbol.d;
1518
47.6k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1519
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1520
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
1521
      // It has nothing to do with alias.
1522
47.8k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1523
146
        d = tensor_blocks[d].ref - 1;
1524
47.6k
      if (tensor_arena->vt_tensors[d])
1525
47.6k
        continue;
1526
      // Assert original alias has no ofs. Otherwise our binding will be problematic.
1527
26
      
for (j = 0; 2
j < CCV_NNC_MAX_DIM_ALLOC;
j++24
)
1528
24
        { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0); }
1529
2
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1530
0
      {
1531
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1532
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1533
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1534
0
        if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1535
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1536
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1537
        // It is OK to be just as a whole smaller or equal to the binded one.
1538
0
        assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1539
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1540
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1541
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1542
2
      } else {
1543
2
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1544
2
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1545
2
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1546
2
        tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1547
2
        tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1548
2
        tv->data = tensor_binds[i].tensor->data;
1549
2
        tv->dataof = tensor_binds[i].tensor->dataof;
1550
2
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1551
2
      }
1552
2
    }
1553
47.6k
  }
1554
  // Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1555
  // Avoiding refs that actually is an alias.
1556
97.8k
  
for (i = 0; 6.26k
i < tensor_symbol_info_size;
i++91.6k
)
1557
    // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1558
91.6k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_blocks[i].ref61.5k
&&
!tensor_arena->vt_tensors[i]6.41k
&&
!tensor_blocks[i].alias_ref6.41k
)
1559
6.23k
    {
1560
6.23k
      int ref = tensor_blocks[i].ref - 1;
1561
6.23k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && 
tensor_blocks[ref].ref149
)
1562
1
        ref = tensor_blocks[ref].ref - 1;
1563
6.23k
      assert(tensor_arena->vt_tensors[ref]);
1564
6.23k
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1565
6.23k
    }
1566
  // Now after refs assigned out, handle the case I need to preserve because I am a sub graph of while loop.
1567
6.26k
  if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1568
21
  {
1569
21
    assert(graph_prep->p);
1570
21
    const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1571
21
    const int p_idx = graph_prep->p_idx - 1;
1572
46
    for (i = 0; i < node->input_size; 
i++25
)
1573
25
    {
1574
25
      const int idx = node->inputs[i];
1575
25
      int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1576
25
      assert(!tensor_blocks[block_ref].ref);
1577
25
      const int vt_ref = alloc_prep->vt_blocks[block_ref];
1578
25
      if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1579
18
        continue;
1580
25
      assert
(vt_ref >= 0)7
;
1581
7
      const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1582
7
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1583
7
      assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]));
1584
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1585
      // previous layer, therefore, cannot really find the buffer ptr.
1586
7
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[block_ref]0
) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1587
7
        ((graph_prep->dup_tensor_block_ref &&
1588
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 04
&&
1589
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref4
) ||
1590
7
         
!tensor_arena->buffers[buffer_ref].ptr3
))
1591
4
      {
1592
        // We haven't allocated anything for this yet.
1593
4
        assert(tensor_arena->vt_tensors[block_ref] == 0);
1594
4
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1595
4
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1596
4
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1597
4
      } else {
1598
3
        const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1599
3
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1600
3
        ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1601
3
      }
1602
7
    }
1603
21
  }
1604
  // For case..of statement, the output is a phi variable, thus, if we take the skip branch, we will select the original input.
1605
  // This created the multi-view tensor to achieve that.
1606
97.8k
  
for (i = 0; 6.26k
i < tensor_symbol_info_size;
i++91.6k
)
1607
91.6k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1608
10
    {
1609
10
      const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1610
      // Create phi multi-view.
1611
10
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1612
10
      const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1613
10
      const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1614
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1615
10
      ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1616
10
      ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1617
10
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1618
10
        intv,
1619
10
        outv,
1620
10
      }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI, mv);
1621
10
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1622
10
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1623
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1624
10
      ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1625
10
    }
1626
  // Now it is time to handle alias.
1627
36.5k
  for (i = 0; i < alloc_prep->block_size; 
i++30.2k
)
1628
30.2k
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1629
30.1k
    {
1630
30.1k
      const int block_ref = alloc_prep->blocks[i].block_ref;
1631
30.1k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]) && 
!tensor_arena->vt_tensors[block_ref]2.69k
)
1632
2.69k
      {
1633
        // Assigning out the tensor aliases.
1634
2.69k
        assert(tensor_symbol_info[block_ref].alias_ref);
1635
2.69k
        _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1636
2.69k
      }
1637
30.1k
    }
1638
  // Now assigning out the rest of alias refs.
1639
97.8k
  
for (i = 0; 6.26k
i < tensor_symbol_info_size;
i++91.6k
)
1640
    // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1641
91.6k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_blocks[i].alias_ref61.5k
&&
!tensor_arena->vt_tensors[i]180
)
1642
177
    {
1643
177
      int ref = tensor_blocks[i].alias_ref - 1;
1644
177
      assert(tensor_arena->vt_tensors[ref]);
1645
177
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1646
177
    }
1647
  // Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1648
6.31k
  
for (i = 0; 6.26k
i < tensor_arena->sub_arena_size;
i++50
)
1649
50
    if (tensor_arena->sub_arenas[i])
1650
49
    {
1651
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1652
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1653
138
      for (j = 0; j < node->input_size; 
j++89
)
1654
89
      {
1655
89
        const int idx = node->inputs[j];
1656
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && 
tensor_symbol_info[idx].s_ref->rnum > i87
) ?
*(int*)78
ccv_array_get78
(tensor_symbol_info[idx].s_ref, i) - 1 :
-111
;
1657
89
        if (s_idx < 0)
1658
23
          continue;
1659
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1660
        // Only do the replacement if it is a multi-view tensor.
1661
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1662
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor) &&
!18
TENSOR_EXPECT_UNASSIGNED18
(tensor_blocks[idx]))
1663
18
        {
1664
          // It cannot be binded tensor.
1665
18
          assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]));
1666
18
          const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1667
18
          const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1668
18
          ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? 
sub_arena_out_tensors[idx]1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)17
;
1669
          // If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1670
          // to this tensor.
1671
18
          if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor))
1672
6
          {
1673
6
            const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1674
6
            ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1675
6
            ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? 
vt_tensor1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)5
);
1676
6
            ref_tensor->alias_ref = is_sub_arena_out_tensor ? 
(uintptr_t)vt_tensor1
:
(uintptr_t)vt_pos5
;
1677
6
            ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1678
6
            ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? 
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)5
CCV_NNC_MULTIVIEW_DATA5
(multiview)[0]) :
CCV_NNC_MULTIVIEW_DATA1
(multiview)[0]1
);
1679
6
            while (CCV_IS_TENSOR_MULTIVIEW(tv))
1680
0
              tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]);
1681
6
            *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1682
6
            ref_tensor->data = tv->data;
1683
6
            ref_tensor->dataof = tv->dataof;
1684
6
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1685
6
          } else
1686
12
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? 
vt_tensor0
: (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1687
18
        }
1688
66
      }
1689
49
    }
1690
  // After alias created, for case..of statement, we now revert back to flat tensor rather than multi-view.
1691
  // No worries though, this new tensor is subscribed for the phi multi-view. More over, we have logic
1692
  // when initialize case..of node, which will take the phi multi-view again.
1693
97.8k
  
for (i = 0; 6.26k
i < tensor_symbol_info_size;
i++91.6k
)
1694
91.6k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1695
10
    {
1696
10
      assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]));
1697
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1698
10
      assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI);
1699
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1700
10
    }
1701
  // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1702
97.8k
  
for (i = 0; 6.26k
i < tensor_symbol_info_size;
i++91.6k
)
1703
91.6k
    if (tensor_arena->vt_tensors[i])
1704
83.1k
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1705
  // Associate multiview tensors from sub arena to the parent.
1706
6.26k
  if (sub_arena_out_tensors)
1707
29
  {
1708
240
    for (i = 0; i < alloc_prep->block_size; 
i++211
)
1709
211
      if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1710
111
      {
1711
111
        const int block_ref = alloc_prep->blocks[i].block_ref;
1712
111
        if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1713
0
          continue;
1714
111
        int sub_arena_ref = block_ref;
1715
111
        if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1716
10
        {
1717
          // Assigning out the tensor aliases.
1718
10
          assert(tensor_symbol_info[block_ref].alias_ref);
1719
10
          const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1720
          // It referenced to is not an alias.
1721
10
          assert(tensor_arena->vt_tensors[alias_ref]);
1722
10
          sub_arena_ref = alias_ref;
1723
10
          if (!sub_arena_out_tensors[sub_arena_ref])
1724
3
            continue;
1725
10
        }
1726
108
        if (!sub_arena_out_tensors[sub_arena_ref])
1727
84
          continue;
1728
24
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref]) ? 
sub_arena_out_tensors[sub_arena_ref]23
:
(ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref1
);
1729
24
        assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1730
        // This is only possible if the vt_tensors is a phi node.
1731
24
        if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1732
0
        {
1733
          // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1734
0
          ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1735
0
          assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI);
1736
0
          assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]));
1737
0
          CCV_NNC_MULTIVIEW_DATA(phi)[1]->alias_ref = (uintptr_t)mv;
1738
0
          ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)[1]);
1739
24
        } else {
1740
24
          tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1741
24
          ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1742
24
        }
1743
24
      }
1744
29
  }
1745
  // Go over all the tensors that has assign_ref. If the tensor it is assigned from is:
1746
  // 1). From sub_arena_out_tensors, it could be possible that it now pointing to an area this arena doesn't know.
1747
  // 2). From phi multi-view, for this case, it is in fact that this arena won't know which memory I am going to use prior.
1748
  // Therefore, for above two scenarios, the tensor has assign_ref, even it is a multiview tensor, need to subscribe
1749
  // to the output of assign_ref tensor.
1750
97.8k
  
for (i = 0; 6.26k
i < tensor_symbol_info_size;
i++91.6k
)
1751
91.6k
    if (tensor_arena->vt_tensors[i] && 
tensor_symbol_info[i].assign_ref83.1k
)
1752
25
    {
1753
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1754
25
      ccv_nnc_tensor_t* assign_tensor;
1755
25
      if (sub_arena_out_tensors && 
sub_arena_out_tensors[assign_ref]3
)
1756
0
        assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref]) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1757
25
      else
1758
25
        assign_tensor = tensor_arena->vt_tensors[assign_ref];
1759
25
      ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1760
25
    }
1761
  // After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1762
53.9k
  for (i = 0; i < tensor_bind_size; 
i++47.6k
)
1763
47.6k
  {
1764
47.6k
    assert(tensor_binds[i].tensor);
1765
47.6k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1766
47.6k
    if (resolved_symbol.d >= 0)
1767
47.6k
    {
1768
47.6k
      int d = resolved_symbol.d;
1769
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
1770
      // It has nothing to do with alias.
1771
47.8k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && 
tensor_blocks[d].ref46.8k
)
1772
146
        d = tensor_blocks[d].ref - 1;
1773
      // Note we don't trace back on alias. This is intentional.
1774
47.6k
      assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1775
47.6k
    }
1776
47.6k
  }
1777
6.26k
  if (sub_arena_out_tensors)
1778
29
    ccfree(sub_arena_out_tensors);
1779
  // Rewire sub arena's tensor references.
1780
6.31k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1781
50
    if (tensor_arena->sub_arenas[i])
1782
49
    {
1783
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1784
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1785
138
      for (j = 0; j < node->input_size; 
j++89
)
1786
89
      {
1787
89
        const int idx = node->inputs[j];
1788
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && 
tensor_symbol_info[idx].s_ref->rnum > i87
) ?
*(int*)78
ccv_array_get78
(tensor_symbol_info[idx].s_ref, i) - 1 :
-111
;
1789
89
        if (s_idx < 0)
1790
23
          continue;
1791
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1792
        // Only do the replacement if it is a multi-view tensor.
1793
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1794
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor))
1795
18
        {
1796
          // This is binded tensor, bind it now.
1797
18
          if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1798
0
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1799
18
          else
1800
18
            _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1801
18
        }
1802
66
      }
1803
49
    }
1804
6.26k
  return tensor_arena;
1805
6.26k
}
1806
1807
static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1808
17
{
1809
17
  assert(graph);
1810
17
  if ((intptr_t)graph == tensor_arena->graph_ref)
1811
7
  {
1812
7
    assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size);
1813
7
    return tensor_arena->vt_tensors[pair_ref];
1814
7
  }
1815
10
  int i;
1816
13
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++3
)
1817
10
    if (tensor_arena->sub_arenas[i])
1818
10
    {
1819
10
      ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1820
10
      if (tensor)
1821
7
        return tensor;
1822
10
    }
1823
3
  return 0;
1824
10
}
1825
1826
// OR the CCV_TAPE_ALLOC bit into the tensor's type. When the tensor is a multiview,
// the bit is set on the multiview itself and then, recursively, on each of its
// (repeat + kind) entries in CCV_NNC_MULTIVIEW_DATA.
static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
{
  if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
  {
    // Plain tensor: nothing to recurse into.
    tensor->type |= CCV_TAPE_ALLOC;
    return;
  }
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
  mv->type |= CCV_TAPE_ALLOC;
  int k = 0;
  while (k < mv->repeat + mv->kind)
  {
    _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)[k]);
    ++k;
  }
}
1838
1839
// Post-process one arena (and, recursively, its sub-arenas) after construction:
//  - a symbol with a pair_ref gets its vt_tensors slot overwritten with the tensor found
//    for (symbolic_graph->pair, pair_ref - 1), searching from root_arena;
//  - a symbol flagged CCV_NNC_TENSOR_SYMBOL_TAPE_VAR that has a tensor is marked as a tape
//    variable, unless it is a non-multiview tensor whose backing buffer is READ_ONLY.
static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
{
  assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
  int k;
  for (k = 0; k < graph_prep->tensor_symbol_info_size; k++)
  {
    if (graph_prep->tensor_symbol_info[k].pair_ref)
    {
      // Taken from its pair; the tape-var check below does not apply to it.
      tensor_arena->vt_tensors[k] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[k].pair_ref - 1);
      continue;
    }
    if (!(graph_prep->tensor_symbol_info[k].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
      continue;
    ccv_nnc_tensor_t* const tensor = tensor_arena->vt_tensors[k];
    if (!tensor)
      continue;
    if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
    {
      // A normal tensor sitting on a read-only buffer does not need the tape mark.
      const int vt_ref = graph_prep->alloc_prep->vt_blocks[k];
      if (vt_ref >= 0 &&
        TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref]) == READ_ONLY)
        continue;
    }
    _ccv_nnc_tensor_mark_as_tape_var(tensor);
  }
  // Apply the same fix-up to every populated sub-prep and its matching sub-arena.
  for (k = 0; k < graph_prep->sub_prep_size; k++)
  {
    if (!graph_prep->sub_preps[k])
      continue;
    _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[k], tensor_arena->sub_arenas[k]);
  }
}
1868
1869
// Register exec node `idx` with a tensor block by updating the block's head and tail arrays.
// exec_dep is consulted through ccv_get_sparse_matrix_cell(exec_dep, a, b); a positive cell
// is treated as an ordering relation between a and b.
//  - head: an existing head that is ordered after idx is replaced by idx (further such heads
//    are dropped); if an existing head is ordered before idx, idx is not added.
//  - tail: the mirror image -- an existing tail ordered before idx is replaced by idx
//    (further such tails are dropped); if an existing tail is ordered after idx, idx is not added.
// If neither relation holds against any entry, idx is appended.
// The block is passed by value; only the arrays it points to are modified.
static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
{
  int pos;
  int settled;
  ccv_numeric_data_t dep;

  // ---- Head side ----
  ccv_array_t* head = tensor_blocks.head;
  assert(head);
  settled = 0;
  pos = 0;
  while (pos < head->rnum)
  {
    int* const slot = (int*)ccv_array_get(head, pos);
    const int head_idx = *slot;
    if (head_idx == idx)
    {
      // Already recorded as a head.
      settled = 1;
      break;
    }
    dep = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
    if (dep.i32 && dep.i32[0] > 0)
    {
      /* idx precedes this head entry. */
      if (settled)
      {
        /* idx already occupies a slot: drop this entry by moving the last one into it. */
        if (pos < head->rnum - 1)
          *slot = *(int*)ccv_array_get(head, head->rnum - 1);
        --head->rnum;
        continue; /* Re-examine the entry that was moved into pos. */
      }
      /* First such entry: idx takes over its slot. */
      settled = 1;
      *slot = idx;
    } else {
      // If this head precedes idx, idx is deterministically later and cannot be a head.
      dep = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
      if (dep.i32 && dep.i32[0] > 0)
      {
        settled = 1;
        break;
      }
    }
    ++pos;
  }
  /* No relation to any existing head: idx becomes an additional head. */
  if (!settled)
    ccv_array_push(head, &idx);

  // ---- Tail side ----
  ccv_array_t* tail = tensor_blocks.tail;
  assert(tail);
  settled = 0;
  pos = 0;
  while (pos < tail->rnum)
  {
    int* const slot = (int*)ccv_array_get(tail, pos);
    const int tail_idx = *slot;
    if (tail_idx == idx)
    {
      // Already recorded as a tail.
      settled = 1;
      break;
    }
    dep = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
    if (dep.i32 && dep.i32[0] > 0)
    {
      /* idx follows this tail entry. */
      if (settled)
      {
        /* idx already occupies a slot: drop this entry by moving the last one into it. */
        *slot = *(int*)ccv_array_get(tail, tail->rnum - 1);
        --tail->rnum;
        continue; /* Re-examine the entry that was moved into pos. */
      }
      /* First such entry: idx takes over its slot. */
      settled = 1;
      *slot = idx;
    } else {
      // If this tail follows idx, idx is deterministically earlier and cannot be a tail.
      dep = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
      if (dep.i32 && dep.i32[0] > 0)
      {
        settled = 1;
        break;
      }
    }
    ++pos;
  }
  /* No relation to any existing tail: idx becomes an additional tail. */
  if (!settled)
    ccv_array_push(tail, &idx);
}
1957
1958
ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1959
7.09k
{
1960
7.09k
  if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1961
6.99k
  {
1962
6.99k
    assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size);
1963
6.99k
    ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1964
6.99k
    if (tensor && 
CCV_IS_TENSOR_MULTIVIEW6.99k
(tensor))
1965
11
    {
1966
11
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1967
22
      while (CCV_IS_TENSOR_MULTIVIEW(mv))
1968
11
        mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? 
mv->it1
:
CCV_NNC_MULTIVIEW_DATA10
(mv)[0]10
);
1969
11
      return (ccv_nnc_tensor_t*)mv;
1970
11
    }
1971
6.98k
    return tensor;
1972
6.99k
  }
1973
100
  int i;
1974
123
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++23
)
1975
99
    if (tensor_arena->sub_arenas[i])
1976
99
    {
1977
99
      ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1978
99
      if (tensor)
1979
76
        return tensor;
1980
99
    }
1981
24
  return 0;
1982
100
}
1983
1984
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1985
66.7k
{
1986
66.7k
  if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1987
66.7k
  {
1988
66.7k
    assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size);
1989
66.7k
    return graph_exec_arena->graph_execs[symbol.d];
1990
66.7k
  }
1991
7
  int i;
1992
9
  for (i = 0; i < graph_exec_arena->sub_arena_size; 
i++2
)
1993
7
    if (graph_exec_arena->sub_arenas[i])
1994
7
    {
1995
7
      ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1996
7
      if (!CCV_NO_GRAPH_EXEC(exec))
1997
5
        return exec;
1998
7
    }
1999
2
  return (ccv_nnc_graph_exec_t){}; // 0.
2000
7
}
2001
2002
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2003
9
{
2004
9
  return graph_exec_arena->source;
2005
9
}
2006
2007
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2008
9
{
2009
9
  return graph_exec_arena->destination;
2010
9
}
2011
2012
// Check whether the head is the beginning of this block.
2013
static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
2014
50
{
2015
50
  assert(tensor_block->head);
2016
50
  return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0) == head_node);
2017
50
}
2018
2019
// Check whether the tail is the end of this block.
2020
static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2021
39
{
2022
39
  assert(tensor_block->tail);
2023
39
  return (tensor_block->tail->rnum == 1 && 
*(int*)36
ccv_array_get36
(tensor_block->tail, 0) == tail_node);
2024
39
}
2025
2026
// Make two tensor blocks one. Return 1 if that happened.
2027
static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2028
6.76k
{
2029
  // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2030
6.76k
  if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]) &&
2031
6.76k
    
(6.72k
!6.72k
TENSOR_IS_UNFOLDABLE_AS_OUTPUT6.72k
(tensor_blocks[p_ref_1]) ||
tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 118
) &&
2032
6.76k
    
tensor_blocks[p_ref_0].tail->rnum == 16.71k
&&
2033
6.76k
    
tensor_blocks[p_ref_1].head->rnum == 16.71k
&&
2034
6.76k
    
tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type6.71k
&& // Must be the same type.
2035
6.76k
    
*(int*)6.70k
ccv_array_get6.70k
(tensor_blocks[p_ref_0].tail, 0) == *(int*)
ccv_array_get6.70k
(tensor_blocks[p_ref_1].head, 0))
2036
6.42k
  {
2037
    // If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2038
6.42k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]));
2039
6.42k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]));
2040
6.42k
    ccv_array_free(tensor_blocks[p_ref_0].tail);
2041
6.42k
    tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2042
6.42k
    if (tensor_blocks[p_ref_1].p_refs[0])
2043
14
    {
2044
14
      assert(tensor_blocks[p_ref_1].p_refs[1] == 0); // It simply cannot have more than one p_refs, otherwise we cannot merge.
2045
14
      if (!tensor_blocks[p_ref_0].p_refs[0])
2046
10
        tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2047
4
      else
2048
4
        tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2049
14
    }
2050
6.42k
    tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2051
6.42k
    TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]));
2052
6.42k
    ccv_array_free(tensor_blocks[p_ref_1].head);
2053
6.42k
    if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1]))
2054
16
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]);
2055
    // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2056
6.42k
    TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1]);
2057
6.42k
    tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2058
6.42k
    if (!tensor_blocks[p_ref_0].r_refs)
2059
6.23k
      tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2060
6.42k
    ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2061
6.42k
    tensor_blocks[p_ref_1].size = 0;
2062
6.42k
    tensor_blocks[p_ref_1].head = 0;
2063
6.42k
    tensor_blocks[p_ref_1].tail = 0;
2064
6.42k
    return 1;
2065
6.42k
  }
2066
335
  return 0;
2067
6.76k
}
2068
2069
static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2070
6.27k
{
2071
6.27k
  int i, j, k;
2072
  // Generate exec dependencies (or, in other words, partial ordering of executions).
2073
6.27k
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2074
6.27k
  int* buf = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2075
6.27k
  int buf_size;
2076
6.27k
  if (p_node_info)
2077
62
    { assert(output_size == 0); }
2078
6.27k
#define for_block(x, val) \
2079
212k
  do { \
2080
212k
    if (((int32_t*)val)[0] > 0) \
2081
212k
    { \
2082
212k
      buf[buf_size * 2] = x; \
2083
212k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2084
212k
      ++buf_size; \
2085
212k
    } \
2086
212k
  } while (0)
2087
32.3k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
2088
32.3k
    buf_size = 0; /* save all its parent deps to this buffer */
2089
32.3k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2090
32.3k
    if (vector)
2091
212k
      
CCV_SPARSE_VECTOR_FOREACH25.7k
(exec_dep, vector, for_block);
2092
32.3k
    if (!node->outgoings)
2093
6.94k
      continue;
2094
53.5k
    
for (i = 0; 25.3k
i < node->outgoings->rnum;
i++28.1k
)
2095
28.1k
    {
2096
28.1k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
2097
28.1k
      const int32_t one = 1;
2098
28.1k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2099
      /* If not found, set, if the current node is the destination node, no need 
2100
       * set itself as parent of subsequent nodes because its terminal nature. */
2101
28.1k
      if (!cell.i32 || 
cell.i32[0] == 00
)
2102
28.1k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2103
28.1k
      if (buf_size > 0)
2104
22.7k
      {
2105
22.7k
        ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2106
22.7k
        assert(vector);
2107
257k
        
for (j = 0; 22.7k
j < buf_size;
j++234k
) /* set with all idx's dependencies as well */
2108
234k
        {
2109
234k
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2110
          /* If not found, set */
2111
234k
          if (!cell.i32 || 
cell.i32[0] == 031.0k
)
2112
203k
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2113
31.0k
          else {
2114
            /* Otherwise, set to the longest one */
2115
31.0k
            int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
2116
31.0k
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2117
31.0k
          }
2118
234k
        }
2119
22.7k
      }
2120
28.1k
    }
2121
25.3k
  } ccv_nnc_graph_visit_endfor
2122
6.27k
#undef for_block
2123
6.27k
  ccfree(buf);
2124
  // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2125
6.27k
  const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2126
6.27k
  ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2127
  // The reason is that I need to make everyone of them to be unassigned unless it is used somewhere. It
2128
  // happens that I have to loop through all relevant node to find out if one is used or not.
2129
98.0k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++91.7k
)
2130
91.7k
    tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2131
32.3k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2132
123k
    for (i = 0; i < node->input_size; 
i++91.1k
)
2133
91.1k
      if (node->inputs[i] >= 0)
2134
64.5k
      {
2135
64.5k
        tensor_blocks[node->inputs[i]].flags = 0;
2136
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2137
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2138
64.5k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
2139
64.5k
          
(56.6k
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD56.6k
||
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD56.5k
))
2140
21
          tensor_blocks[node->inputs[i]].pin_mem = 1;
2141
64.5k
      }
2142
82.8k
    for (i = 0; i < node->output_size; 
i++50.5k
)
2143
50.5k
      if (node->outputs[i] >= 0)
2144
41.5k
      {
2145
41.5k
        tensor_blocks[node->outputs[i]].flags = 0;
2146
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2147
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2148
41.5k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
2149
41.5k
          
(36.1k
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD36.1k
||
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD36.1k
))
2150
18
          tensor_blocks[node->outputs[i]].pin_mem = 1;
2151
41.5k
      }
2152
32.3k
  } ccv_nnc_graph_visit_endfor
2153
6.27k
  if (p_node_info)
2154
62
  {
2155
62
    assert(p_tensor_symbol_info);
2156
    // Mark it as used if it is used in either input or output.
2157
165
    
for (i = 0; 62
i < p_node_info->input_size;
i++103
)
2158
103
      if (p_node_info->inputs[i] >= 0)
2159
103
      {
2160
103
        const int d = p_node_info->inputs[i];
2161
103
        if (p_tensor_symbol_info[d].s_ref && 
p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx101
)
2162
92
        {
2163
92
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
2164
92
          if (dd >= 0) // If this exists in this sub-graph, great.
2165
80
            tensor_blocks[dd].flags = 0;
2166
92
        }
2167
103
      }
2168
132
    for (i = 0; i < p_node_info->output_size; 
i++70
)
2169
70
      if (p_node_info->outputs[i] >= 0)
2170
70
      {
2171
70
        const int d = p_node_info->outputs[i];
2172
70
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2173
70
        {
2174
70
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
2175
70
          if (dd >= 0) // If this exists in this sub-graph, great.
2176
70
            tensor_blocks[dd].flags = 0;
2177
70
        }
2178
70
      }
2179
62
  }
2180
98.0k
  
for (i = 0; 6.27k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.7k
)
2181
91.7k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
2182
70.9k
    {
2183
      // Check no tensor info is auto now.
2184
70.9k
      assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
2185
      // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
2186
      // therefore, itself life-cycle almost certainly won't concatenate properly with the tensor to
2187
      // fold to).
2188
70.9k
      if (tensor_symbol_info[i].assign_ref)
2189
40
      {
2190
        // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2191
        // It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2192
        // it kept its own representation, which is not the case for output).
2193
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2194
40
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2195
        // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2196
40
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref]);
2197
        // It also cannot be folded as output (except i), because we need to keep its own representation.
2198
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref]);
2199
40
        assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0);
2200
40
        tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2201
63
        for (j = 0; j < unroll_count; 
j++23
)
2202
23
        {
2203
23
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
2204
23
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
2205
23
        }
2206
40
        if (tensor_blocks[assign_ref].bypass_ref)
2207
4
        {
2208
          // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2209
4
          tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2210
4
          const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2211
4
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref]);
2212
4
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref]);
2213
          // On the other hand, it can be folded into the except_ref for the bypass_ref.
2214
4
          tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2215
4
          if (dup_tensor_from_ref)
2216
2
          {
2217
2
            const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2218
2
            if (bypass_from_ref >= 0)
2219
2
            {
2220
2
              TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref]);
2221
2
              TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref]);
2222
2
              assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref);
2223
2
              for (j = 0; j < unroll_count - 1; 
j++0
)
2224
0
              {
2225
                // Mark every incarnation as unfold-able.
2226
0
                TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
2227
0
                TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
2228
0
              }
2229
2
            }
2230
2
          }
2231
4
        }
2232
40
      }
2233
70.9k
    }
2234
98.0k
  
for (i = 0; 6.27k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.7k
)
2235
91.7k
  {
2236
    // If it has a pair reference, we don't need to allocate this tensor at all,
2237
    // set it to be unassigned.
2238
91.7k
    if (tensor_symbol_info[i].pair_ref)
2239
15
      TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i]);
2240
    // If it is a tape variable, set it to be un-foldable as well (otherwise we cannot use tape properly).
2241
91.7k
    else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2242
7
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2243
7
      TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2244
      // For this case, there is no exception.
2245
7
      tensor_blocks[i].unfoldable_except_ref = 0;
2246
91.7k
    } else if (tensor_symbol_info[i].p_ref) {
2247
119
      assert(p_node_info);
2248
119
      const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2249
      // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2250
119
      if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2251
        // TODO: This check can be lifted if we can fold in the parent graph.
2252
48
        if (-1 == p_ref_is_in_or_out)
2253
20
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2254
119
      if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2255
68
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2256
119
    }
2257
91.7k
  }
2258
98.0k
  
for (i = 0; 6.27k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.7k
)
2259
91.7k
  {
2260
91.7k
    if (tensor_symbol_info[i].alias_ref)
2261
3.26k
    {
2262
3.26k
      const int ref = tensor_symbol_info[i].alias_ref - 1;
2263
      // If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2264
3.26k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && 
!1.58k
TENSOR_EXPECT_UNASSIGNED1.58k
(tensor_blocks[i]))
2265
1.04k
        tensor_blocks[ref].flags = 0;
2266
      // An alias cannot ref to another alias.
2267
3.26k
      assert(!tensor_symbol_info[ref].alias_ref);
2268
3.26k
      tensor_blocks[i].flags = ALIAS;
2269
3.26k
      tensor_blocks[i].ref = ref + 1; // Assign the ref.
2270
3.26k
      if (!tensor_blocks[ref].r_refs)
2271
3.22k
        tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2272
3.26k
      ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2273
3.26k
    }
2274
91.7k
  }
2275
  // Scan again and if the ref is not assigned, mark the alias not assigned.
2276
98.0k
  
for (i = 0; 6.27k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.7k
)
2277
91.7k
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
2278
3.26k
    {
2279
3.26k
      const int ref = tensor_blocks[i].ref - 1;
2280
3.26k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]))
2281
539
      {
2282
        // Mark this as unassigned.
2283
539
        tensor_blocks[i].flags = UNASSIGNED;
2284
539
        tensor_blocks[i].ref = 0;
2285
539
      }
2286
3.26k
    }
2287
98.0k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++91.7k
)
2288
91.7k
  {
2289
    // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2290
91.7k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
2291
69.2k
    {
2292
69.2k
      tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2293
69.2k
      tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2294
      // Cache tensor size (align to 16 bytes).
2295
69.2k
      tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2296
69.2k
    }
2297
    // If there is a p_ref, add the one to the p_refs list.
2298
91.7k
    if (tensor_symbol_info[i].p_ref)
2299
128
      tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2300
91.7k
  }
2301
32.3k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2302
123k
    for (i = 0; i < node->input_size; 
i++91.1k
)
2303
91.1k
    {
2304
91.1k
      int d = node->inputs[i];
2305
91.1k
      if (d < 0)
2306
26.5k
        continue;
2307
64.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2308
1.57k
        d = tensor_symbol_info[d].alias_ref - 1;
2309
64.5k
      tensor_blocks[d].flags |= READ_ONLY;
2310
64.5k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2311
15
        continue;
2312
64.5k
      assert
(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))64.5k
;
2313
      /* If this is first encounter, its head starts (this tensor is init'ed outside of the graph
2314
       * from the very beginning of the graph life-cycle and ends here. */
2315
64.5k
      if (tensor_blocks[d].head->rnum == 0 && 
!27.6k
TENSOR_REQUIRE_INIT27.6k
(tensor_symbol_info[d].flags))
2316
27.5k
      {
2317
87.4k
        for (j = 0; j < source_size; 
j++59.8k
)
2318
59.8k
        {
2319
          // If the source is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2320
59.8k
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2321
59.8k
          if (cell.i32 && 
cell.i32[0] > 022.7k
)
2322
22.7k
            _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2323
59.8k
        }
2324
        /* If this is a read-only (based on SSA, if first encountered as read), and this is
2325
         * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2326
         * loop, however, in that case, you need to prevent read-only gets reused for the
2327
         * output tensor, which is not obvious how to implement correctly), and it is not
2328
         * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2329
         * of memory anyway (because on second loop, we want to read the same value out).
2330
         * Mark it to the end of the graph. */
2331
27.5k
        if (p_node_info && 
!tensor_symbol_info[d].assign_ref146
)
2332
210
          
for (j = 0; 105
j < destination_size;
j++105
)
2333
105
          {
2334
            // If the destination is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2335
105
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2336
105
            if (cell.i32 && 
cell.i32[0] > 065
)
2337
65
              _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2338
105
          }
2339
27.5k
      }
2340
64.5k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2341
64.5k
    }
2342
82.8k
    
for (i = 0; 32.3k
i < node->output_size;
i++50.5k
)
2343
50.5k
    {
2344
50.5k
      int d = node->outputs[i];
2345
50.5k
      if (d < 0)
2346
8.91k
        continue;
2347
41.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2348
1.36k
        d = tensor_symbol_info[d].alias_ref - 1;
2349
41.5k
      tensor_blocks[d].flags |= WRITE_ONLY;
2350
41.5k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2351
0
        continue;
2352
41.5k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2353
41.5k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2354
41.5k
    }
2355
32.3k
  } ccv_nnc_graph_visit_endfor
2356
  // For any assign_ref, its life-time kept until the end and wrap over.
2357
98.0k
  
for (i = 0; 6.27k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.7k
)
2358
    // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2359
    // that "somewhere else" need to keep its life-time til the end.
2360
91.7k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) &&
2361
91.7k
      
p_node_info69.2k
&&
tensor_symbol_info[i].assign_ref282
)
2362
42
    {
2363
42
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2364
84
      for (j = 0; j < destination_size; 
j++42
)
2365
42
      {
2366
        // This logic is to be more conservative about which destination we add to.
2367
        // As of now, if we add everything, it is fine most likely. However, it may
2368
        // cause issues in the future to do so naively. Thus, instead, we only add
2369
        // the destination to it iff either the tensor is not used at all, or, the
2370
        // destination is on the same stream as of the tensor block some way.
2371
42
        int flag = !tensor_blocks[assign_ref].tail;
2372
83
        for (k = 0; !flag && 
k < tensor_blocks[assign_ref].tail->rnum73
;
k++41
)
2373
41
        {
2374
41
          const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k);
2375
41
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2376
41
          flag = (cell.i32 && 
cell.i32[0] > 010
);
2377
41
        }
2378
42
        if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2379
10
          _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2380
42
      }
2381
42
    }
2382
6.38k
  for (i = 0; i < output_size; 
i++107
)
2383
107
  {
2384
107
    assert(outputs[i].graph == symbolic_graph);
2385
107
    int d = outputs[i].d;
2386
107
    if (d < 0)
2387
0
      continue;
2388
107
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2389
0
      d = tensor_symbol_info[d].alias_ref - 1;
2390
107
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2391
0
      continue;
2392
107
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2393
377
    
for (j = 0; 107
j < destination_size;
j++270
)
2394
270
    {
2395
270
      int flag = !tensor_blocks[d].tail;
2396
540
      for (k = 0; !flag && 
k < tensor_blocks[d].tail->rnum505
;
k++270
)
2397
270
      {
2398
270
        const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
2399
270
        const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2400
270
        flag = (cell.i32 && 
cell.i32[0] > 035
);
2401
270
      }
2402
270
      if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2403
35
        _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2404
270
    }
2405
107
  }
2406
  // Enforce tensor reuse by collapse tensors for in-place operations. We will fault if this cannot be done.
2407
32.3k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2408
32.3k
    int x, y;
2409
123k
    for (x = 0; x < node->input_size; 
x++91.1k
)
2410
261k
      
for (y = 0; 91.1k
y < node->output_size;
y++169k
)
2411
        /* Some operations enforces some tensors to be the same for inputs / outputs. */
2412
169k
        if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2413
180
        {
2414
          // If both unassigned, it is fine.
2415
180
          if (node->inputs[x] < 0 && 
node->outputs[y] < 00
)
2416
0
            continue;
2417
180
          int ref = node->inputs[x];
2418
180
          assert(ref >= 0);
2419
180
          while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref0
)
2420
0
            ref = tensor_blocks[ref].ref - 1;
2421
180
          const int node_output_y = node->outputs[y];
2422
180
          assert(node_output_y >= 0);
2423
          // If both are not computable, it is fine, we don't need to enforce.
2424
180
          if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2425
180
            
!0
TENSOR_EXPECT_COMPUTABLE0
(tensor_blocks[node_output_y]))
2426
0
            continue;
2427
          // Otherwise, enforce and error out if failed.
2428
180
          if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2429
0
            { assert(0 && "cannot enforce inplace for the two tensors"); }
2430
180
        }
2431
32.3k
  } ccv_nnc_graph_visit_endfor
2432
  // Ignore tensors that are already binded, no matter if it is used or not. Doing it here because
2433
  // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2434
  // that is not enforced in-place (because the tensor enforced in-place will be different than the
2435
  // binding one).
2436
53.9k
  
for (i = 0; 6.27k
i < tensor_bind_size;
i++47.6k
)
2437
47.6k
  {
2438
47.6k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2439
    // If there is a tensor binded, then it is unassigned.
2440
47.6k
    if (resolved_symbol.d >= 0)
2441
47.6k
    {
2442
47.6k
      int d = resolved_symbol.d;
2443
      // I cannot assert too much at this moment.
2444
47.6k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2445
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2446
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
2447
      // It has nothing to do with alias.
2448
47.8k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && 
tensor_blocks[d].ref12.5k
)
2449
146
        d = tensor_blocks[d].ref - 1;
2450
      // Doesn't work if this is a loop carrying variable.
2451
47.6k
      assert(!tensor_symbol_info[d].assign_ref);
2452
47.6k
      tensor_blocks[d].flags = UNASSIGNED;
2453
47.6k
      tensor_blocks[d].ref = 0; // No need to have ref as well.
2454
47.6k
    }
2455
47.6k
  }
2456
  // Maximum tensor reuse by collapse tensors allows in-place operations (and it matches the start, end tensor).
2457
32.3k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2458
32.3k
    int x, y;
2459
123k
    for (x = 0; x < node->input_size; 
x++91.1k
)
2460
91.1k
    {
2461
      /* If the input is not assigned, it can be referenced, find the referenced one */
2462
91.1k
      int ref = node->inputs[x];
2463
91.1k
      if (ref < 0)
2464
26.5k
        continue;
2465
64.5k
      const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2466
71.9k
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref38.4k
)
2467
7.32k
        ref = tensor_blocks[ref].ref - 1;
2468
64.5k
      assert(tensor_blocks[ref].ref == 0);
2469
64.5k
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2470
64.5k
        
tensor_blocks[ref].tail->rnum == 133.4k
)
2471
33.2k
      {
2472
87.1k
        for (y = 0; y < node->output_size; 
y++53.8k
)
2473
          /* Only proceed if the input symbol is different from the output symbol, */
2474
          /* and the input symbol meets the output symbol exactly at the same spot. */
2475
53.8k
          if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2476
53.8k
            
node->outputs[y] >= 013.4k
&&
2477
53.8k
            
ref != node->outputs[y]13.4k
&&
2478
53.8k
            
TENSOR_EXPECT_COMPUTABLE13.4k
(tensor_blocks[node->outputs[y]]))
2479
6.59k
          {
2480
6.59k
            const int node_output_y = node->outputs[y];
2481
6.59k
            const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2482
            /* If dimension matches perfectly, then we can assign y_symbol to x.
2483
             * If both of them are aliases, making sure their origin matches in size too. */
2484
6.59k
            if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
2485
6.57k
            {
2486
6.57k
              _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2487
              // This refers to an alias itself, now mark it and will be processed later.
2488
6.57k
              if (ref != node->inputs[x])
2489
308
                tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2490
6.57k
            }
2491
6.59k
          }
2492
33.2k
      }
2493
64.5k
    }
2494
32.3k
  } ccv_nnc_graph_visit_endfor
2495
  // Specifically handle the bypass. This need to be done after the first pass.
2496
  // I need to extend the bypass life-time to the same as the one I am going with.
2497
  // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2498
6.27k
  ccv_nnc_tensor_block_t empty_block = {};
2499
6.27k
  empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2500
6.27k
  empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2501
32.3k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2502
32.3k
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2503
13
    {
2504
13
      int can_bypass = 1;
2505
28
      for (i = 0; can_bypass && 
i < node->output_size25
;
i++15
)
2506
15
      {
2507
15
        int d = node->outputs[i];
2508
15
        if (d < 0)
2509
0
          continue;
2510
15
        if (!tensor_blocks[d].bypass_ref)
2511
2
          continue;
2512
13
        while (tensor_blocks[d].ref)
2513
0
          d = tensor_blocks[d].ref - 1;
2514
13
        int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2515
14
        while (tensor_blocks[bypass_ref].ref)
2516
1
          bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2517
        // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2518
13
        if (!tensor_symbol_info[bypass_ref].assign_ref && 
!tensor_symbol_info[bypass_ref].r_assign_ref10
)
2519
10
          continue;
2520
3
        ccv_array_clear(empty_block.head);
2521
6
        for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; 
j++3
)
2522
3
          ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j));
2523
3
        ccv_array_clear(empty_block.tail);
2524
6
        for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; 
j++3
)
2525
3
          ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j));
2526
6
        for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++3
)
2527
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), empty_block);
2528
6
        for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++3
)
2529
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), empty_block);
2530
        // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2531
3
        assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref));
2532
3
        int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : 
tensor_symbol_info[bypass_ref].r_assign_ref - 10
;
2533
3
        while (tensor_blocks[b_ref].ref)
2534
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2535
3
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2536
3
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2537
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2538
        // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2539
3
        can_bypass = can_bypass && (a_hop_b || b_hop_a);
2540
3
      }
2541
13
      if (can_bypass)
2542
10
      {
2543
22
        for (i = 0; i < node->output_size; 
i++12
)
2544
12
        {
2545
12
          int d = node->outputs[i];
2546
12
          if (d < 0)
2547
0
            continue;
2548
12
          if (!tensor_blocks[d].bypass_ref)
2549
2
            continue;
2550
10
          while (tensor_blocks[d].ref)
2551
0
            d = tensor_blocks[d].ref - 1;
2552
10
          int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2553
10
          while (tensor_blocks[bypass_ref].ref)
2554
0
            bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2555
          // The bypass_ref can extend its life-time.
2556
20
          for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++10
)
2557
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), tensor_blocks[bypass_ref]);
2558
20
          for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++10
)
2559
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), tensor_blocks[bypass_ref]);
2560
10
        }
2561
10
      } else {
2562
6
        for (i = 0; i < node->output_size; 
i++3
)
2563
3
          tensor_blocks[node->outputs[i]].bypass_ref = 0;
2564
3
        const int exec_idx = (dup_exec_from_ref) ? 
dup_exec_from_ref[idx]1
:
idx2
;
2565
        // Mark this exec as no bypass IO (thus, I need to insert explicit data transfer).
2566
3
        exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2567
3
      }
2568
13
    }
2569
32.3k
  } ccv_nnc_graph_visit_endfor
2570
6.27k
  ccv_array_free(empty_block.head);
2571
6.27k
  ccv_array_free(empty_block.tail);
2572
6.27k
  *r_exec_dep = exec_dep;
2573
6.27k
  *r_tensor_blocks = tensor_blocks;
2574
6.27k
}
2575
2576
static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2577
33
{
2578
33
  if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || 
cmd.cmd == CCV_NNC_GRAPH_BACKWARD30
)
2579
3
  {
2580
3
    ccv_nnc_cmd_t retval = cmd;
2581
3
    retval.cmd = CCV_NNC_NOOP;
2582
3
    return retval;
2583
3
  }
2584
30
  return cmd;
2585
33
}
2586
2587
static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2588
102
{
2589
102
  if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2590
47
  {
2591
47
    if (tensor_symbol_info[input].alias_ref)
2592
18
    {
2593
18
      const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2594
18
      assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2595
18
      ccv_nnc_tensor_symbol_t tensor_symbol = {};
2596
18
      if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2597
6
      {
2598
6
        tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2599
6
        if (tensor_symbol_info[alias_ref].pair_ref)
2600
0
          ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2601
0
            .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2602
0
            .graph = dup_graph->pair
2603
0
          });
2604
6
        ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2605
6
        dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2606
12
      } else {
2607
12
        tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2608
12
        tensor_symbol.graph = dup_graph;
2609
12
      }
2610
18
      ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2611
18
      if (tensor_symbol_info[input].pair_ref)
2612
0
        ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2613
0
          .d = tensor_symbol_info[input].pair_ref - 1,
2614
0
          .graph = dup_graph->pair
2615
0
        });
2616
18
      ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2617
18
      dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2618
29
    } else {
2619
29
      ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2620
29
      if (tensor_symbol_info[input].pair_ref)
2621
4
        ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2622
4
          .d = tensor_symbol_info[input].pair_ref - 1,
2623
4
          .graph = dup_graph->pair
2624
4
        });
2625
29
      ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2626
29
      dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2627
29
    }
2628
47
    if (tensor_symbol_info[input].bypass_ref)
2629
2
    {
2630
2
      const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2631
2
      assert(dup_bypass_ref >= 0);
2632
2
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2633
2
      symbol_info->bypass_ref = dup_bypass_ref + 1;
2634
2
    }
2635
47
  }
2636
102
  return (ccv_nnc_tensor_symbol_t) {
2637
102
    .d = dup_tensor_block_ref[input * unroll_count],
2638
102
    .graph = dup_graph,
2639
102
  };
2640
102
}
2641
2642
// Return the exec symbol inside dup_graph that mirrors `node` (index `idx` in the original graph),
// creating it the first time it is requested. dup_exec_ref[idx * unroll_count] caches the index of the
// duplicated exec symbol; a negative value there means the node has not been duplicated yet.
// max_inputs / max_outputs are caller-provided scratch arrays; they are written up to node->input_size /
// node->output_size entries respectively.
static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
{
  int* const cached = dup_exec_ref + idx * unroll_count;
  if (*cached < 0)
  {
    int k;
    // Inputs must be duplicated before outputs, because an output could have a bypass reference to an input.
    for (k = 0; k < node->input_size; k++)
    {
      const int d = node->inputs[k];
      if (d >= 0)
        max_inputs[k] = _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, d);
      else // Negative indexes are not duplicated, they are carried over as is and only re-homed to dup_graph.
        max_inputs[k] = (ccv_nnc_tensor_symbol_t){ .d = d, .graph = dup_graph };
    }
    for (k = 0; k < node->output_size; k++)
    {
      const int d = node->outputs[k];
      if (d >= 0)
        max_outputs[k] = _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, d);
      else
        max_outputs[k] = (ccv_nnc_tensor_symbol_t){ .d = d, .graph = dup_graph };
    }
    const ccv_nnc_graph_exec_symbol_t created = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
    // Remember it, so the next request for the same node returns the same duplicate.
    *cached = created.d;
  }
  const ccv_nnc_graph_exec_symbol_t dup_symbol = {
    .d = *cached,
    .graph = dup_graph,
  };
  return dup_symbol;
}
2660
2661
// Release an array of tensor blocks: first every array a block owns (head, tail, r_refs, dup_p_refs,
// each only if it was allocated), then the tensor_blocks storage itself.
static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
{
  int b;
  for (b = 0; b < tensor_block_size; b++)
  {
    ccv_array_t* const owned[] = {
      tensor_blocks[b].head,
      tensor_blocks[b].tail,
      tensor_blocks[b].r_refs,
      tensor_blocks[b].dup_p_refs,
    };
    int k;
    for (k = 0; k < (int)(sizeof(owned) / sizeof(owned[0])); k++)
      if (owned[k])
        ccv_array_free(owned[k]);
  }
  ccfree(tensor_blocks);
}
2677
2678
// Find tensors that cannot be solved by co-allocating to the same location.
2679
static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2680
21
{
2681
21
  int i, j, unroll_count = 0;
2682
131
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++110
)
2683
110
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_symbol_info[i].assign_ref90
)
2684
25
    {
2685
      // This is is a parameter, thus, it has to be either an alias or used.
2686
25
      assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2687
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2688
      // The parameter it assign to has to be either an alias or used.
2689
25
      assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2690
      // If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2691
      // If it is the same, we are good, no need to extend.
2692
25
      int a_ref = i;
2693
25
      while (tensor_blocks[a_ref].ref)
2694
0
        a_ref = tensor_blocks[a_ref].ref - 1;
2695
25
      int b_ref = assign_ref;
2696
31
      while (tensor_blocks[b_ref].ref)
2697
6
        b_ref = tensor_blocks[b_ref].ref - 1;
2698
25
      if (a_ref != b_ref)
2699
19
      {
2700
        // If any of the b's head is deterministically later than a's tail
2701
        // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2702
19
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2703
19
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2704
        // It cannot be that both i can hop to j can j can hop to i.
2705
19
        assert(!(a_hop_b > 0 && b_hop_a > 0));
2706
        // Can it be folded
2707
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2708
19
        if (a_hop_b || 
b_hop_a16
)
2709
3
        {
2710
3
          tensor_blocks[a_ref].companion_ref = b_ref + 1;
2711
3
          tensor_blocks[b_ref].companion_ref = a_ref + 1;
2712
3
          continue;
2713
3
        }
2714
16
        int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2715
20
        for (j = 0; c_ref >= 0; 
j++4
)
2716
4
        {
2717
4
          while (tensor_blocks[c_ref].ref)
2718
0
            c_ref = tensor_blocks[c_ref].ref - 1;
2719
4
          c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2720
4
        }
2721
16
        unroll_count = ccv_max(unroll_count, j + 1);
2722
16
      }
2723
25
    }
2724
  // Reset companion_ref if need to unroll.
2725
21
  if (unroll_count)
2726
91
    
for (j = 0; 13
j < symbolic_graph->tensor_symbol_info->rnum;
j++78
)
2727
78
      tensor_blocks[j].companion_ref = 0;
2728
21
  return unroll_count;
2729
21
}
2730
2731
// Unroll the graph unroll_count times into dup_graph (which already contains a copy of the original graph).
// r_dup_exec_ref and r_dup_tensor_block_ref are filled as [original index * unroll_count + n] tables that
// give, for unroll iteration n, the index of the duplicate inside dup_graph (-1 for a tensor that has not
// been duplicated). After each iteration the destinations of dup_graph are replaced with the duplicated
// exec symbols that have no outgoing nodes.
static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
{
  // How a node was encountered while walking the original graph.
  enum {
    SEEN_AS_INCOMING = 1,
    SEEN_AS_OUTGOING = 2,
  };
  const int exec_symbol_size = symbolic_graph->exec_symbol_info->rnum;
  const int tensor_symbol_size = symbolic_graph->tensor_symbol_info->rnum;
  int i, j, n;
  // The inout exec nodes, these are the nodes we are going to extend.
  uint8_t* const inout = (uint8_t*)cccalloc(exec_symbol_size, sizeof(uint8_t));
  // Size the scratch input / output arrays for the widest node.
  int max_input_size = 0;
  int max_output_size = 0;
  for (i = 0; i < exec_symbol_size; i++)
  {
    if (exec_symbol_info[i].input_size > max_input_size)
      max_input_size = exec_symbol_info[i].input_size;
    if (exec_symbol_info[i].output_size > max_output_size)
      max_output_size = exec_symbol_info[i].output_size;
  }
  ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
  ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
  // Doing graph expansion.
  // It goes without saying, we must have at least one tensor / exec (otherwise 0 cannot be used as "no exec ref").
  assert(dup_graph->exec_symbol_info->rnum > 0);
  assert(dup_graph->tensor_symbol_info->rnum > 0);
  for (n = 0; n < unroll_count; n++)
  {
    int* const dup_exec_ref = r_dup_exec_ref + n;
    int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
    // The table of the previous iteration, none for the first one.
    const int* const prev_dup_tensor_block_ref = (n > 0) ? r_dup_tensor_block_ref + (n - 1) : 0;
    for (i = 0; i < exec_symbol_size; i++)
      dup_exec_ref[i * unroll_count] = -1;
    for (i = 0; i < tensor_symbol_size; i++)
    {
      int dup_ref = -1; // By default, the tensor has no duplicate yet.
      if (tensor_symbol_info[i].assign_ref)
      {
        // With an assign_ref there is no need to dup the tensor: it maps to what it assigns to,
        // taken from the previous iteration if there is one, from the original graph otherwise.
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
        dup_ref = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
      } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY) {
        // A read-only tensor block never changes its value (assign_ref is handled above),
        // therefore it maps to itself and no duplicate is generated.
        dup_ref = i;
      }
      dup_tensor_block_ref[i * unroll_count] = dup_ref;
    }
    // Go through the original graph, make copies of each node and reconnect the copies.
    ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
      const ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
      inout[idx] |= SEEN_AS_INCOMING; /* Mark this node as incoming. */
      if (!node->outgoings)
        continue;
      for (i = 0; i < node->outgoings->rnum; i++)
      {
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
        inout[outgoing_idx] |= SEEN_AS_OUTGOING; /* Mark this node as outgoing. */
        const ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
        ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
      }
    } ccv_nnc_graph_visit_endfor
    // Check the visited nodes are all marked as either incoming or outgoing, and chain this copy
    // after the current destinations of dup_graph.
    const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
    const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
    for (i = 0; i < exec_symbol_size; i++)
    {
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
        continue;
      assert(inout[i] & (SEEN_AS_INCOMING | SEEN_AS_OUTGOING));
      // Only pure incoming nodes need to be concatenated after every current destination node.
      if (inout[i] != SEEN_AS_INCOMING)
        continue;
      const ccv_nnc_graph_exec_symbol_t dup_symbol = {
        .d = dup_exec_ref[i * unroll_count],
        .graph = dup_graph,
      };
      for (j = 0; j < dup_destination_size; j++)
        ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
          .d = dup_destinations[j].d,
          .graph = dup_graph,
        }, dup_symbol);
    }
    // Rebuild the destinations from the nodes just duplicated.
    if (dup_graph->destinations)
      ccv_array_clear(dup_graph->destinations);
    for (i = 0; i < exec_symbol_size; i++)
    {
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
        continue;
      const int d = dup_exec_ref[i * unroll_count];
      const ccv_nnc_graph_exec_symbol_info_t* const dup_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
      // If this has no outgoing node, add to the destination.
      if (!dup_info->outgoings || dup_info->outgoings->rnum == 0)
        ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
          .graph = dup_graph,
          .d = d,
        });
    }
  }
  ccfree(inout);
}
2828
2829
// After unrolling, re-point every assign_ref among the original tensors (symbolic_graph is the old graph,
// dup_tensor_symbol_info is the symbol info of the unrolled graph) to the duplicate made in the last unroll
// iteration, so the assignment wraps over the whole unrolled graph. r_assign_ref is kept in sync: cleared
// on the old target, set on the new one.
static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
{
  const int tensor_symbol_size = symbolic_graph->tensor_symbol_info->rnum;
  const int last_unroll = unroll_count - 1;
  int t;
  for (t = 0; t < tensor_symbol_size; t++)
  {
    ccv_nnc_tensor_symbol_info_t* const info = dup_tensor_symbol_info + t;
    if (!info->assign_ref)
      continue;
    const int old_assign_ref = info->assign_ref - 1; // assign_ref starts at 1.
    // Detach from the old target.
    dup_tensor_symbol_info[old_assign_ref].r_assign_ref = 0;
    // Get to the last one, which we will wrap over.
    info->assign_ref = dup_tensor_block_ref[old_assign_ref * unroll_count + last_unroll] + 1;
    assert(info->assign_ref);
    dup_tensor_symbol_info[info->assign_ref - 1].r_assign_ref = t + 1;
  }
}
2843
2844
// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2845
// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2846
// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2847
static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2848
21
{
2849
21
  int i, j, k;
2850
45
  for (i = 0; i < p_node_info->output_size; 
i++24
)
2851
24
  {
2852
24
    const int d = p_node_info->outputs[i];
2853
24
    const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2854
24
    if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2855
6
      continue;
2856
36
    
for (k = 0; 18
k < destination_size;
k++18
)
2857
18
      _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2858
    // Add the duplicated destinations to the tensor_block_ref.
2859
42
    for (j = 0; j < unroll_count; 
j++24
)
2860
48
      
for (k = 0; 24
k < destination_size;
k++24
)
2861
24
      {
2862
24
        const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2863
24
        const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2864
24
        if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2865
24
          _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2866
24
      }
2867
18
  }
2868
21
}
2869
2870
// Check whether the graph has tensor blocks that cannot be simply solved with either in-place tensor block
// folding or sharing the same memory region. If there are none, nothing is changed. Otherwise, the graph is
// unrolled into a duplicate graph, the old exec_dep and tensor blocks are freed, and both are recomputed
// against the duplicate. In that case *r_exec_dep, *r_tensor_blocks, *r_tensor_block_size, *r_dup_graph,
// *r_unroll_count, *r_dup_exec_ref and *r_dup_tensor_block_ref are all assigned; they are left untouched
// when no unroll is required. Note that the sources / destinations parameters are not read here, the ones
// of the duplicate graph are used instead.
static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
{
  int i, j;
  ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
  ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
  // Unfortunately, this analysis cannot cover the block folding done for sub-graphs, because sub-graph
  // placement happens later.
  const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
  if (!unroll_count)
    return; // No need to change anything, we are good.
  const int tensor_symbol_size = symbolic_graph->tensor_symbol_info->rnum;
  const int exec_symbol_size = symbolic_graph->exec_symbol_info->rnum;
  // There are conditions that cannot be satisfied with the simple solution (allocate to the same memory
  // region). Do graph expansion: first duplicate the old graph, but replace all sub-graphs with noop.
  ccv_nnc_symbolic_graph_t* const dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
  int* const dup_exec_ref = (int*)ccmalloc(sizeof(int) * exec_symbol_size * unroll_count);
  int* const initial_dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * tensor_symbol_size * unroll_count);
  _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, initial_dup_tensor_block_ref, dup_exec_ref);
  // Infer the symbol info of the unrolled graph.
  ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
  ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
  ccv_nnc_graph_exec_symbol_t* const dup_sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0);
  const int dup_source_size = dup_graph->sources->rnum;
  ccv_nnc_graph_exec_symbol_t* const dup_destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0);
  const int dup_destination_size = dup_graph->destinations->rnum;
  ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, dup_sources, dup_source_size, dup_destinations, dup_destination_size, 0);
  ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, dup_sources, dup_source_size, dup_destinations, dup_destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
  _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, initial_dup_tensor_block_ref, dup_tensor_symbol_info);
  // Free out the old exec_dep and the old tensor blocks, prepare for the new.
  ccv_matrix_free(exec_dep);
  _ccv_nnc_tensor_blocks_free(tensor_blocks, tensor_symbol_size);
  // A reverse map to find which original tensor a (duplicated) tensor comes from, -1 if unknown.
  int* const dup_tensor_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
  for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
    dup_tensor_from_ref[i] = -1;
  for (i = 0; i < tensor_symbol_size; i++)
  {
    const int* const row = initial_dup_tensor_block_ref + i * unroll_count;
    for (j = 0; j < unroll_count; j++)
      if (row[j] >= 0)
        dup_tensor_from_ref[row[j]] = i;
  }
  // The same reverse map for exec symbols.
  int* const dup_exec_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
  for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
    dup_exec_from_ref[i] = -1;
  for (i = 0; i < exec_symbol_size; i++)
  {
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
      continue;
    dup_exec_from_ref[i] = i; // An original node references back to itself.
    const int* const row = dup_exec_ref + i * unroll_count;
    for (j = 0; j < unroll_count; j++)
      if (row[j] >= 0)
        dup_exec_from_ref[row[j]] = i;
  }
  // Reset all attr.
  memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * exec_symbol_size);
  // Recompute exec_dep and tensor_blocks, this time against the unrolled graph.
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, dup_sources, dup_source_size, dup_destinations, dup_destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, initial_dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
  ccv_nnc_graph_visit_free(dup_visit);
  ccfree(dup_exec_symbol_info);
  ccfree(dup_exec_from_ref);
  ccfree(dup_tensor_from_ref);
  // Assign out dup_p_refs, which will be used to extend the anonymous block life-time.
  for (i = 0; i < tensor_symbol_size; i++)
  {
    if (!tensor_blocks[i].p_refs[0] && !tensor_blocks[i].p_refs[1])
      continue;
    // Loop over all possible duplications to assign dup_p_refs properly.
    for (j = 0; j < unroll_count; j++)
    {
      const int dup_idx = initial_dup_tensor_block_ref[j + i * unroll_count];
      if (dup_idx < 0)
        continue;
      const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
      const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
      if (p_ref_0_is_in_or_out == 1) // If it is an out tensor, mark dup_p_refs for this, and we are done.
      {
        if (!tensor_blocks[dup_idx].dup_p_refs)
          tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
        ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
        continue;
      }
      if (tensor_blocks[i].p_refs[1] == 0)
        continue;
      // Otherwise, give the second parent reference the same treatment.
      const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
      const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
      if (p_ref_1_is_in_or_out == 1)
      {
        if (!tensor_blocks[dup_idx].dup_p_refs)
          tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
        ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
      }
    }
  }
  // companion_ref: now the original tensors can take the dup they assign to as companion.
  for (i = 0; i < tensor_symbol_size; i++)
  {
    // assign_ref starts at 1, anything else means there is nothing to pair with.
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) || dup_tensor_symbol_info[i].assign_ref <= 0)
      continue;
    // This already points to the last duplicate, which we will wrap over.
    int b_ref = dup_tensor_symbol_info[i].assign_ref - 1;
    while (tensor_blocks[b_ref].ref)
      b_ref = tensor_blocks[b_ref].ref - 1;
    int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
    int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
    // After duplication, it must be possible to hop from one to the other.
    assert(a_hop_b > 0 || b_hop_a > 0);
    tensor_blocks[i].companion_ref = b_ref + 1;
    tensor_blocks[b_ref].companion_ref = i + 1;
  }
  ccfree(dup_tensor_symbol_info);
  // Extend the dup tensor block ref to cover all tensors of the dup graph, prepare for future extensions.
  int* const dup_tensor_block_ref = (int*)ccrealloc(initial_dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
  for (i = tensor_symbol_size * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
    dup_tensor_block_ref[i] = -1;
  // Assign out changed properties.
  *r_exec_dep = exec_dep;
  *r_tensor_blocks = tensor_blocks;
  *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
  *r_dup_graph = dup_graph;
  *r_unroll_count = unroll_count;
  *r_dup_exec_ref = dup_exec_ref;
  *r_dup_tensor_block_ref = dup_tensor_block_ref;
}
2986
2987
static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2988
31
{
2989
31
  if (!anonymous_block_free_list || 
!anonymous_block_free_list_cap15
)
2990
28
    return tensor_block_size;
2991
3
  int i;
2992
3
  const int no_dup_p_refs = (!dup_p_refs || 
!dup_p_refs->rnum0
);
2993
3
  int found_idx = tensor_block_size;
2994
3
  for (i = 0; i < anonymous_block_free_list_cap; 
i++0
)
2995
3
  {
2996
3
    const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2997
3
    assert(idx < tensor_block_size);
2998
    // If the type doesn't match, ignore.
2999
3
    if (tensor_blocks[idx].type != type)
3000
0
      continue;
3001
    // Heuristic about how to select the best tensor block to move forward.
3002
    // If the size is larger, and no dup_p_refs, found, I cannot do better than this, just return directly.
3003
3
    if (tensor_blocks[idx].size >= size)
3004
3
    {
3005
3
      if (no_dup_p_refs)
3006
3
        return idx;
3007
      // Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
3008
      // then we cannot do better than this, if that is the case, just return.
3009
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
3010
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
3011
0
        return idx;
3012
0
    }
3013
0
    int64_t found_idx_size_diff;
3014
0
    int64_t idx_size_diff;
3015
0
    if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
3016
      // Now, compare whether this one or the found_idx one is better.
3017
      // At this point, there is no point of comparing the dup_p_refs, we only care about which one
3018
      // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
3019
0
      (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3020
0
    {
3021
0
      found_idx = idx;
3022
0
      continue;
3023
0
    }
3024
    // No need to update if found_idx is better than idx.
3025
0
    if (found_idx_size_diff > idx_size_diff)
3026
0
      continue;
3027
    // We bias towards the bigger one in case of similar.
3028
0
    if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3029
0
    {
3030
0
      found_idx = idx;
3031
0
      continue;
3032
0
    }
3033
0
    assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
3034
    // On a tie, check which one has tighter life-cycle.
3035
0
    if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3036
0
    {
3037
      // Check whether the current tensor blocks life-cycle is longer than the previous one.
3038
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3039
0
        (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3040
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3041
0
        found_idx = idx;
3042
0
      continue;
3043
0
    }
3044
    // Now both our size is smaller than requested size, in this case, we need to increase the tensor block size.
3045
    // We prefer to choose the one that has life-cycle closer to the expected ones.
3046
0
    if (no_dup_p_refs)
3047
0
    {
3048
      // Whoever is shorter wins.
3049
0
      if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3050
0
        (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3051
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3052
0
        found_idx = idx;
3053
0
      continue;
3054
0
    }
3055
0
    if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3056
0
      continue;
3057
0
    if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3058
0
    {
3059
0
      found_idx = idx;
3060
0
      continue;
3061
0
    }
3062
    // If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3063
0
    const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3064
0
    const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3065
0
    if (idx_after_request && found_idx_after_request)
3066
0
    {
3067
0
      if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3068
0
        found_idx = idx;
3069
0
      continue;
3070
0
    } else {
3071
      // We entered this branch must be either idx_after_request is false or found_idx_after_request is false or both.
3072
      // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3073
      // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3074
0
      if (!found_idx_after_request && (idx_after_request ||
3075
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3076
0
        found_idx = idx;
3077
0
      continue;
3078
0
    }
3079
0
  }
3080
0
  return found_idx;
3081
3
}
3082
3083
static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3084
49
{
3085
49
  if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3086
28
    return 0;
3087
21
  int i, j, k;
3088
21
  int input_size = 0;
3089
43
  for (i = 0; i < p_node_info->p_while.input_size; 
i++22
)
3090
22
    if (p_node_info->p_while.inputs[i] >= 0)
3091
2
      ++input_size;
3092
  // If it doesn't have tensor inputs (thus, only special inputs), just return.
3093
21
  if (!input_size)
3094
19
    return 0;
3095
2
  ccv_nnc_tensor_symbol_t inputs[input_size];
3096
2
  input_size = 0;
3097
6
  for (i = 0; i < p_node_info->p_while.input_size; 
i++4
)
3098
4
    if (p_node_info->p_while.inputs[i] >= 0)
3099
2
      inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3100
2
        .d = p_node_info->p_while.inputs[i],
3101
2
        .graph = symbolic_graph,
3102
2
      };
3103
2
  assert(symbolic_graph->breakpoint_size > 0);
3104
2
  ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
3105
2
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3106
4
  for (i = 0; i < symbolic_graph->breakpoint_size; 
i++2
)
3107
2
  {
3108
    // Make a noop copy of the breakpoint, but with some tensor inputs.
3109
2
    ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
3110
2
    ccv_array_push(dup_breakpoints, &noop);
3111
    // Connect this noop to the outgoing nodes of breakpoints.
3112
2
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
3113
2
    if (symbol_info->outgoings)
3114
4
      
for (j = 0; 2
j < symbol_info->outgoings->rnum;
j++2
)
3115
2
      {
3116
2
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3117
2
        ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3118
2
          .d = d,
3119
2
          .graph = symbolic_graph,
3120
2
        });
3121
2
      }
3122
2
  }
3123
7
  for (i = 0; i < exec_symbol_info_size; 
i++5
)
3124
5
  {
3125
5
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
3126
5
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
3127
0
      continue;
3128
5
    if (symbol_info->outgoings)
3129
3
    {
3130
3
      const int outgoing_size = symbol_info->outgoings->rnum;
3131
6
      for (j = 0; j < outgoing_size; 
j++3
)
3132
3
      {
3133
3
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3134
6
        for (k = 0; k < symbolic_graph->breakpoint_size; 
k++3
)
3135
3
          if (d == symbolic_graph->breakpoints[k].d)
3136
0
          {
3137
0
            ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
3138
0
            ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3139
0
              .d = i,
3140
0
              .graph = symbolic_graph,
3141
0
            }, noop);
3142
            // Found, connected, exit.
3143
0
            break;
3144
0
          }
3145
3
      }
3146
3
    }
3147
5
  }
3148
  // Add the dup_breakpoints to source if necessary.
3149
2
  assert(symbolic_graph->sources);
3150
2
  const int source_size = symbolic_graph->sources->rnum;
3151
4
  for (i = 0; i < source_size; 
i++2
)
3152
2
  {
3153
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
3154
2
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++0
)
3155
2
      if (d == symbolic_graph->breakpoints[j].d)
3156
2
      {
3157
2
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3158
2
        ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3159
        // Found, made, exit.
3160
2
        break;
3161
2
      }
3162
2
  }
3163
  // Add the dup_breakpoints to destination if necessary.
3164
2
  assert(symbolic_graph->destinations);
3165
2
  const int destination_size = symbolic_graph->destinations->rnum;
3166
4
  for (i = 0; i < destination_size; 
i++2
)
3167
2
  {
3168
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
3169
4
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++2
)
3170
2
      if (d == symbolic_graph->breakpoints[j].d)
3171
0
      {
3172
0
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3173
0
        ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3174
        // Found, made, exit.
3175
0
        break;
3176
0
      }
3177
2
  }
3178
2
  return dup_breakpoints;
3179
2
}
3180
3181
// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3182
static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3183
6.26k
{
3184
6.26k
  assert(source_size > 0);
3185
6.26k
  assert(destination_size > 0);
3186
  // First, fill all the "auto" holes.
3187
  // This is the symbol table with the "auto" info filled in.
3188
6.26k
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3189
6.26k
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3190
6.26k
  ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3191
12.5k
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new6.26k
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3192
0
  ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3193
12.5k
  int i, j, k, p, q;
3194
12.5k
  const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info = 
p_exec_symbol_info6.26k
?
p_exec_symbol_info + (symbolic_graph->exec_idx - 1)49
:
06.21k
;
3195
12.5k
  ccv_sparse_matrix_t* exec_dep;
3196
12.5k
  ccv_nnc_tensor_block_t* tensor_blocks;
3197
12.5k
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3198
12.5k
  int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3199
  // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3200
  // are automatically filled in, and all the sub-graphs are processed.
3201
  // There is a last step though, for a while loop, it is parameterized:
3202
  // while (x > 5) {
3203
  //     y = x + 1;
3204
  // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3205
  // We will do our best to avoid doing the actual data copy; what we do here is to check whether y can be x's alias.
3206
  // If y can be x's alias, this is good, no other changes required. In above case, y can be x's alias because
3207
  // it is an inplace operation.
3208
  // But if y cannot be x's alias, for example, this while loop looks like this:
3209
  // while (x > 5) {
3210
  //     y = x + a
3211
  //     b = x + y
3212
  // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3213
  // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3214
  // has dependency on y as well).
3215
  // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3216
  // y = x + a -> b = x + y
3217
  // This graph will be extended to look like this:
3218
  // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3219
  // while (x0 > 5) {
3220
  //     y0 = x0 + a0
3221
  //     b0 = x0 + y0
3222
  //     if (y0 > 5) break
3223
  //     y1 = y0 + b0
3224
  //     b1 = y0 + y1
3225
  // } (y1 => x0, b1 => a0)
3226
  // After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3227
  // with each other now).
3228
  // With this algorithm, we don't need to insert any data copy logic, the only thing needed is to switch pointers
3229
  // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
3230
12.5k
  ccv_nnc_symbolic_graph_t* dup_graph = 0;
3231
12.5k
  int* dup_exec_ref = 0;
3232
12.5k
  int* dup_tensor_block_ref = 0;
3233
12.5k
  int unroll_count = 0;
3234
  // In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3235
12.5k
  ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)
ccmalloc6.26k
(sizeof(ccv_nnc_symbolic_graph_prep_t));
3236
12.5k
  prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3237
12.5k
  prep->flags = 0;
3238
  // Cannot handle dup a node that is a graph as well.
3239
12.5k
  if (
p_exec_symbol_info6.26k
)
3240
49
  {
3241
49
    prep->flags = p_node_info->flags;
3242
49
    if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3243
21
    {
3244
21
      _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3245
21
      _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3246
28
    } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3247
      // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3248
28
    }
3249
49
  }
3250
12.5k
  ccv_nnc_symbolic_graph_prep_t** sub_preps = 
symbolic_graph->sub_graphs6.26k
&&
symbolic_graph->sub_graphs->rnum29
?
(ccv_nnc_symbolic_graph_prep_t**)29
cccalloc29
(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) :
06.23k
;
3251
12.5k
  ccv_array_t* anonymous_block_free_list = 0;
3252
12.5k
  const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3253
  // Record whether this tensor is folded in this round.
3254
12.5k
  uint32_t* const tensor_fold = (uint32_t*)
ccmalloc6.26k
(sizeof(uint32_t) * tensor_fold_size);
3255
32.2k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3256
32.2k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3257
49
    {
3258
49
      assert(symbolic_graph->sub_graphs);
3259
49
      ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3260
49
      ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3261
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3262
49
      sub_prep->dup_breakpoints = dup_breakpoints;
3263
49
      sub_prep->p = prep;
3264
49
      sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3265
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3266
49
      const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3267
293
      for (i = 0; i < s_alloc_prep->block_size; 
i++244
)
3268
244
      {
3269
244
        const int block_ref = s_alloc_prep->blocks[i].block_ref;
3270
244
        const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3271
244
        if (block_ref < sub_prep->tensor_symbol_info_size)
3272
192
        {
3273
          // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3274
          // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3275
192
          if (s_tensor_blocks[block_ref].bypass_ref)
3276
1
          {
3277
1
            int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3278
1
            while (s_tensor_blocks[bypass_ref].ref)
3279
0
              bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3280
1
            if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3281
1
              
s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1]0
)
3282
1
              continue;
3283
1
          }
3284
191
          if (s_tensor_blocks[block_ref].p_refs[0])
3285
91
          {
3286
            /* If it is already properly assigned, next. */
3287
91
            if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3288
91
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3289
91
            {
3290
91
              if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3291
90
                s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3292
1
              else {
3293
1
                assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3294
1
                s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3295
1
              }
3296
91
            }
3297
            /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3298
91
            if (s_tensor_blocks[block_ref].p_refs[1] &&
3299
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1]3
&&
3300
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1]3
)
3301
3
            {
3302
3
              assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3303
3
              assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3304
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3305
3
            }
3306
91
          }
3307
191
        } else 
if (52
s_tensor_blocks[block_ref].dup_p_refs52
) {
3308
          /* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3309
           * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3310
           * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3311
           * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3312
           * its life-time to the end of the output tensor. */
3313
15
          if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3314
13
            s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3315
33
          for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; 
j++18
)
3316
18
            ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3317
15
        }
3318
244
      }
3319
49
    }
3320
32.2k
    const int init_tensor_block_size = tensor_block_size;
3321
32.2k
    int rw_anonymous_buffer_size_cap = 0;
3322
32.2k
    int ro_anonymous_buffer_size_cap = 0;
3323
32.2k
    if (anonymous_block_free_list)
3324
17
      ccv_array_clear(anonymous_block_free_list);
3325
32.2k
    memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3326
32.2k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3327
49
    {
3328
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3329
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3330
49
      int rw_anonymous_buffer_size = 0;
3331
49
      int ro_anonymous_buffer_size = 0;
3332
229
      for (i = 0; i < s_alloc_prep->buffer_size; 
i++180
)
3333
180
        if (s_alloc_prep->buffers[i].p_refs[0])
3334
90
        {
3335
          /* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3336
90
          int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3337
          /* Need to go through refs. Since we reuse the tensor block for this input, it now has to have at least this much space allocated. */
3338
90
          int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3339
90
          assert(p_ref_0_is_in_or_out != 0);
3340
90
          int unref_p_ref_0 = p_ref_0;
3341
92
          while (tensor_blocks[unref_p_ref_0].ref)
3342
2
            unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3343
          /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3344
90
          assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3345
90
          if (s_alloc_prep->buffers[i].p_refs[1])
3346
4
          {
3347
4
            int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3348
4
            const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3349
4
            assert(p_ref_1_is_in_or_out != 0);
3350
4
            int unref_p_ref_1 = p_ref_1;
3351
4
            while (tensor_blocks[unref_p_ref_1].ref)
3352
0
              unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3353
            /* See above comment for the similar p_ref_0 check. */
3354
4
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3355
4
            assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3356
4
            int p_ref_t;
3357
4
            if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3358
3
            {
3359
3
              CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3360
3
              CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3361
3
            }
3362
4
            p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3363
            /* If the dimensions match, can fold. TODO: should the dimensions match perfectly here? */
3364
4
            if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3365
4
            {
3366
4
              const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3367
4
              if (folded)
3368
1
              {
3369
1
                p_ref_0 = p_ref_1;
3370
1
                unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3371
1
                tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3372
1
                for (j = 0; j < unroll_count; 
j++0
) /* Fold its duplicates as well. */
3373
0
                {
3374
0
                  const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3375
0
                  assert(folded && "the subsequent duplicates can be folded too.");
3376
0
                }
3377
1
              }
3378
4
            }
3379
4
          }
3380
          /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3381
           * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
3382
           * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3383
           * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3384
           * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3385
           * associated with it, then we are good. */
3386
90
          if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3387
90
            
(89
p_ref_0_is_in_or_out == 189
&&
_ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)50
) ||
3388
90
            
(39
p_ref_0_is_in_or_out == -139
&&
_ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)39
) ||
3389
90
            
TENSOR_READ_WRITE8
(s_alloc_prep->buffers[i]) == READ_ONLY8
)
3390
86
          {
3391
86
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3392
27
              { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3393
            /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3394
             * is a long argument why that is the case, the digest is, it is much easier to control your output
3395
             * than your input). */
3396
86
            s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3397
86
            s_alloc_prep->buffers[i].p_refs[1] = 0;
3398
            /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3399
86
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3400
86
            tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3401
95
            for (j = 0; j < unroll_count; 
j++9
) /* Change the size of its duplicates as well. */
3402
9
              tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3403
9
                tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3404
9
                  tensor_blocks[unref_p_ref_0].size;
3405
86
          } else {
3406
4
            s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3407
4
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3408
0
              ++ro_anonymous_buffer_size;
3409
4
            else
3410
4
              rw_anonymous_buffer_size += unroll_count + 1;
3411
4
          }
3412
90
        } else {
3413
90
          if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3414
63
            ++ro_anonymous_buffer_size;
3415
27
          else
3416
27
            rw_anonymous_buffer_size += unroll_count + 1;
3417
90
        }
3418
49
      if (ro_anonymous_buffer_size || 
rw_anonymous_buffer_size24
)
3419
28
      {
3420
28
        const int anonymous_block_free_list_cap = anonymous_block_free_list ? 
anonymous_block_free_list->rnum6
:
022
;
3421
        // All read-write buffer (potentially) can be reused between each case..of branch.
3422
28
        rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3423
        // Read-only buffer cannot be reused between each case..of branch.
3424
28
        ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3425
        /* Anonymous block, allocate additional tensor blocks for this. */
3426
        /* This is either because this is an internal tensor (don't have p_ref) */
3427
        /* or it is an anonymous block itself within the sub graphs of this while graph. */
3428
28
        tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3429
28
        memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3430
28
        if (dup_tensor_block_ref)
3431
3
          dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3432
174
        for (i = 0; i < s_alloc_prep->buffer_size; 
i++146
)
3433
146
          if (!s_alloc_prep->buffers[i].p_refs[0])
3434
94
          {
3435
94
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3436
63
            {
3437
63
              assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3438
63
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3439
63
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3440
63
              tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3441
63
              tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3442
63
              tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3443
63
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3444
63
              tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3445
63
              ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3446
63
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3447
63
              if (dup_p_refs && 
dup_p_refs->rnum > 00
)
3448
0
              {
3449
0
                for (j = 0; j < dup_p_refs->rnum; j++)
3450
0
                {
3451
0
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3452
0
                  assert(dup_p_ref >= 0);
3453
0
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3454
0
                  assert(tensor_blocks[dup_p_ref].tail);
3455
                  // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3456
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3457
0
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3458
0
                  {
3459
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3460
0
                    assert(p_node_info);
3461
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3462
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3463
0
                    {
3464
0
                      if (!tensor_blocks[tensor_block_size].dup_p_refs)
3465
0
                        tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3466
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3467
0
                    }
3468
0
                  }
3469
0
                  if (!tensor_blocks[tensor_block_size].tail)
3470
0
                    tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3471
0
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3472
0
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3473
0
                }
3474
63
              } else {
3475
63
                tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3476
63
                ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3477
63
              }
3478
132
              
for (j = 0; 63
j < source_size;
j++69
)
3479
69
                _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3480
              /* If this is a read-only (based on SSA, if first encountered as read), and this is
3481
               * sub-graph. Mark it to the end of the graph. */
3482
63
              if (p_exec_symbol_info)
3483
12
                
for (j = 0; 6
j < destination_size;
j++6
)
3484
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3485
              /* If it is read-only, it is self-reflecting. */
3486
69
              for (k = 0; k < unroll_count; 
k++6
)
3487
6
              {
3488
12
                for (j = 0; j < destination_size; 
j++6
)
3489
6
                  if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3490
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3491
                /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3492
6
                assert(symbolic_graph->p);
3493
6
                dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3494
6
              }
3495
63
              ++tensor_block_size;
3496
63
            } else {
3497
31
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3498
31
              const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3499
31
              const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3500
              // Find suitable tensor block from the free list.
3501
31
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3502
31
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3503
31
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3504
31
              if (new_anonymous_tensor_block)
3505
28
              {
3506
28
                tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3507
28
                tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3508
28
                tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3509
28
                tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3510
28
                ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3511
28
              } else {
3512
3
                tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3513
3
                tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3514
3
              }
3515
31
              if (dup_p_refs && 
dup_p_refs->rnum > 05
)
3516
5
              {
3517
10
                for (j = 0; j < dup_p_refs->rnum; 
j++5
)
3518
5
                {
3519
5
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3520
5
                  assert(dup_p_ref >= 0);
3521
5
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3522
                  // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3523
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3524
5
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3525
0
                  {
3526
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3527
0
                    assert(p_node_info);
3528
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3529
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3530
0
                    {
3531
0
                      if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3532
0
                        tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3533
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3534
0
                    }
3535
0
                  }
3536
5
                  assert(tensor_blocks[dup_p_ref].tail);
3537
5
                  if (!tensor_blocks[tensor_block_idx].tail)
3538
5
                    tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3539
10
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; 
k++5
)
3540
5
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3541
                  // We have to add it to the warp around companion_ref as well.
3542
                  // TODO: Although we know this wasted space (any space in between current one and its companion_ref will still
3543
                  // be occupied and unlikely to be reused), but we cannot really do too much about it because the companion_ref's
3544
                  // definition is too free-form and if we enforce stronger gaurantee on this (such as it must wrap around), this
3545
                  // gaurantee may be broken down in the line.
3546
5
                  if (tensor_blocks[dup_p_ref].companion_ref)
3547
0
                  {
3548
0
                    const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3549
0
                    for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3550
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3551
0
                    for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3552
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3553
0
                  }
3554
5
                }
3555
26
              } else if (new_anonymous_tensor_block) {
3556
23
                tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3557
23
                ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3558
23
              }
3559
31
              const int prev_tensor_block_idx = tensor_block_idx;
3560
31
              if (new_anonymous_tensor_block)
3561
28
              {
3562
28
                if (!anonymous_block_free_list)
3563
16
                  anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3564
28
                ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3565
28
                ++tensor_block_size;
3566
28
              }
3567
32
              for (k = 0; k < unroll_count; 
k++1
)
3568
1
              {
3569
1
                const int tensor_block_idx = new_anonymous_tensor_block ?
3570
1
                  (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3571
1
                  
dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k]0
;
3572
1
                TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3573
1
                TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3574
1
                if (new_anonymous_tensor_block)
3575
1
                {
3576
1
                  tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3577
1
                  tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3578
1
                  tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3579
1
                  tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3580
                  /* Attach to duplicated exec for this tensor block. */
3581
1
                  ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3582
1
                } else {
3583
0
                  tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3584
0
                  tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3585
0
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3586
3587
0
                }
3588
1
                if (dup_p_refs && dup_p_refs->rnum > 0)
3589
1
                {
3590
                  /* Not nil, not self-reflecting. */
3591
2
                  for (j = 0; j < dup_p_refs->rnum; 
j++1
)
3592
1
                  {
3593
1
                    const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3594
1
                    assert(dup_p_ref >= 0);
3595
1
                    assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3596
                    // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3597
                    // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3598
1
                    if (tensor_symbol_info[dup_p_ref].p_ref)
3599
0
                    {
3600
0
                      const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3601
0
                      assert(p_node_info);
3602
0
                      const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3603
0
                      if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3604
0
                      {
3605
0
                        if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3606
0
                          tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3607
0
                        ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3608
0
                      }
3609
0
                    }
3610
1
                    assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3611
1
                    const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3612
1
                    assert(tensor_blocks[dup_dup_p_ref].tail);
3613
1
                    if (!tensor_blocks[tensor_block_idx].tail)
3614
1
                      tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3615
2
                    for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; 
q++1
)
3616
1
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3617
                    // We have to add it to the warp around companion_ref as well.
3618
1
                    if (tensor_blocks[dup_dup_p_ref].companion_ref)
3619
0
                    {
3620
0
                      const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3621
0
                      for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3622
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3623
0
                      for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3624
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3625
0
                    }
3626
1
                  }
3627
1
                } else 
if (0
new_anonymous_tensor_block0
) {
3628
0
                  tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3629
0
                  ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3630
0
                }
3631
1
                if (new_anonymous_tensor_block)
3632
1
                  ++tensor_block_size;
3633
1
              }
3634
31
            }
3635
94
          }
3636
28
      }
3637
49
    }
3638
32.2k
  } ccv_nnc_graph_visit_endfor
3639
6.26k
  if (anonymous_block_free_list)
3640
16
    ccv_array_free(anonymous_block_free_list);
3641
6.26k
  ccfree(tensor_fold);
3642
  // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3643
  // the allocation dependencies, thus, which tensor is reused to the existing tensor.
3644
6.26k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3645
6.26k
  prep->while_count_tensor = 0;
3646
6.26k
  prep->dup_breakpoints = 0;
3647
6.26k
  prep->p = 0;
3648
6.26k
  prep->symbolic_graph = symbolic_graph;
3649
6.26k
  prep->p_idx = symbolic_graph->p_idx;
3650
6.26k
  prep->exec_idx = symbolic_graph->exec_idx;
3651
6.26k
  prep->sub_prep_size = symbolic_graph->sub_graphs ? 
symbolic_graph->sub_graphs->rnum29
:
06.23k
;
3652
6.26k
  prep->sub_preps = sub_preps;
3653
6.26k
  prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3654
6.26k
  prep->exec_symbol_info = exec_symbol_info;
3655
6.26k
  prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3656
6.26k
  prep->tensor_symbol_info = tensor_symbol_info;
3657
6.26k
  prep->unroll_count = unroll_count;
3658
6.26k
  prep->dup_tensor_block_ref = dup_tensor_block_ref;
3659
6.26k
  prep->tensor_block_size = tensor_block_size;
3660
6.26k
  prep->tensor_blocks = tensor_blocks;
3661
6.26k
  prep->exec_flags = exec_flags;
3662
6.26k
  prep->visit = visit;
3663
6.26k
  prep->alloc_prep = alloc_prep;
3664
6.26k
  if (dup_graph)
3665
13
    ccv_nnc_symbolic_graph_free(dup_graph);
3666
6.26k
  if (dup_exec_ref)
3667
13
    ccfree(dup_exec_ref);
3668
6.26k
  return prep;
3669
12.5k
}
3670
3671
// Tear down a symbolic graph prep together with everything it owns: its tensor blocks,
// exec flags, every non-null sub prep (recursively), the copied tensor / exec symbol
// info arrays, the optional dup_tensor_block_ref table, the alloc prep and the visit
// order. The prep pointer itself is released last and must not be used afterwards.
static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
{
  _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
  ccfree(prep->exec_flags);
  // Sub preps may contain holes (null entries), only recurse into the ones that exist.
  int sub_idx = 0;
  while (sub_idx < prep->sub_prep_size)
  {
    ccv_nnc_symbolic_graph_prep_t* const child = prep->sub_preps[sub_idx];
    if (child)
      _ccv_nnc_symbolic_graph_prep_free(child);
    ++sub_idx;
  }
  if (prep->sub_preps)
  {
    ccfree(prep->sub_preps);
  }
  ccfree(prep->tensor_symbol_info);
  ccfree(prep->exec_symbol_info);
  // dup_tensor_block_ref is optional, it is only released when present.
  if (prep->dup_tensor_block_ref)
  {
    ccfree(prep->dup_tensor_block_ref);
  }
  _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
  ccv_nnc_graph_visit_free(prep->visit);
  ccfree(prep);
}
3689
3690
// Walk the prep in visit order and flag which preps need a while-count tensor.
// For each while node, every p_while input that is a while-count symbol is decoded into
// a depth d; starting from the while node's sub prep we climb d parents (prep->p) and set
// while_count_tensor = 1 on the prep we land on. Afterwards, recurse into every sub-graph
// referenced by the node so nested graphs are handled as well.
static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
{
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
    int input_idx, ref_idx;
    if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
    {
      // A while node keeps its sub-graph in the first graph ref slot (stored 1-based).
      const int while_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
      assert(while_ref >= 0);
      ccv_nnc_symbolic_graph_prep_t* const while_prep = graph_prep->sub_preps[while_ref];
      for (input_idx = 0; input_idx < node->p_while.input_size; input_idx++)
      {
        if (!CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[input_idx]))
          continue;
        // Climb as many parents as the symbol encodes, then mark that prep.
        int levels = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[input_idx]);
        ccv_nnc_symbolic_graph_prep_t* target = while_prep;
        while (levels-- > 0)
          target = target->p;
        target->while_count_tensor = 1;
      }
    }
    // Descend into every sub-graph this node references (unused slots decode to < 0).
    for (ref_idx = 0; ref_idx < node->graph_ref_size; ref_idx++)
    {
      const int sub_ref = CCV_NNC_GRAPH_REF(node)[ref_idx] - 1;
      if (sub_ref < 0)
        continue;
      _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[sub_ref]);
    }
  } ccv_nnc_graph_visit_endfor
}
3717
3718
static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3719
91.0k
{
3720
91.0k
  if (symbol >= 0)
3721
64.4k
    return graph_prep->tensor_arena->vt_tensors[symbol];
3722
26.5k
  if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3723
26.5k
    return 0;
3724
26.5k
  assert
(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))20
;
3725
20
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3726
20
  int i;
3727
20
  const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol);
3728
21
  for (i = 0; i < d; 
i++1
)
3729
1
    prep = prep->p;
3730
20
  assert(prep->while_count_tensor);
3731
20
  return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3732
20
}
3733
3734
// Topologically sort the concrete graph and patch the exec arena so that its handles keep
// pointing at the same nodes. ccv_nnc_graph_topsort fills a table indexed by the old exec
// index; the arena's source, destination and every graph_execs entry that belongs to
// this graph are rewritten through that table. Entries owned by other graphs (or unset)
// are left untouched.
static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
{
  const int exec_count = graph->exec_info->rnum;
  int* const remap = (int*)ccmalloc(sizeof(int) * exec_count);
  ccv_nnc_graph_topsort(graph, remap, exec_count);
  graph_exec_arena->source.d = remap[graph_exec_arena->source.d];
  graph_exec_arena->destination.d = remap[graph_exec_arena->destination.d];
  int n;
  for (n = 0; n < graph_exec_arena->graph_exec_size; n++)
  {
    ccv_nnc_graph_exec_t* const exec = graph_exec_arena->graph_execs + n;
    if (exec->graph != graph)
      continue;
    exec->d = remap[exec->d];
  }
  ccfree(remap);
}
3747
3748
static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3749
6.26k
{
3750
6.26k
  int i, j, k;
3751
6.26k
  ccv_nnc_graph_t* const graph = graph_prep->graph;
3752
6.26k
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3753
6.26k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3754
6.26k
  graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3755
6.26k
  graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3756
6.26k
  graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3757
6.26k
  graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3758
6.26k
  memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3759
6.26k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3760
6.26k
  int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3761
58.4k
  for (i = 0; i < exec_symbol_info_size; 
i++52.1k
)
3762
52.1k
  {
3763
52.1k
    max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size);
3764
52.1k
    max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size);
3765
52.1k
    if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3766
22
      max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size);
3767
52.1k
    graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3768
52.1k
    graph_execs[i].graph = 0;
3769
52.1k
  }
3770
6.31k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
3771
50
    max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size);
3772
6.26k
  ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)];
3773
6.26k
  ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)];
3774
6.26k
  ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)];
3775
6.26k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3776
6.26k
  const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3777
  // Create node, this is in topological order.
3778
32.2k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3779
32.2k
    if (CCV_NO_GRAPH_EXEC(graph_execs[idx]))
3780
32.2k
    {
3781
123k
      for (i = 0; i < node->input_size; 
i++91.0k
)
3782
91.0k
        max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3783
82.6k
      for (i = 0; i < node->output_size; 
i++50.4k
)
3784
50.4k
        max_outputs[i] = node->outputs[i] >= 0 ? 
tensor_arena->vt_tensors[node->outputs[i]]41.5k
:
08.91k
;
3785
32.2k
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3786
21
      {
3787
21
        const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3788
21
        assert(graph_ref >= 0);
3789
21
        ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3790
21
        ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3791
21
        graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3792
21
        const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3793
21
        ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3794
21
        ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3795
43
        for (i = 0; i < node->p_while.input_size; 
i++22
)
3796
22
          max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3797
42
        for (i = 0; i < sub_symbolic_graph->breakpoint_size; 
i++21
)
3798
21
          max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3799
21
        ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3800
21
        _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3801
32.2k
      } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3802
24
        for (i = 0; i < node->output_size; 
i++13
)
3803
13
          if (max_outputs[i] && max_outputs[i]->alias_ref)
3804
10
            max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3805
11
        graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3806
        // Check whether this is already covered in the inputs, if not, need to be covered in the update.
3807
22
        for (i = 0; i < node->case_of.argument.offset; 
i++11
)
3808
11
        {
3809
11
          ccv_nnc_tensor_t* const update = max_inputs[i];
3810
11
          if (!CCV_IS_TENSOR_MULTIVIEW(update)) // No need if it is a naked tensor.
3811
9
            continue;
3812
2
          int flag = 0;
3813
2
          for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; 
j++0
)
3814
0
            flag = (update == max_inputs[j]);
3815
2
          if (!flag)
3816
2
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3817
2
        }
3818
11
        const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 
11
:
010
;
3819
11
        ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3820
11
        if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3821
1
        {
3822
          // Add another graph for data transfer.
3823
1
          ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3824
2
          for (i = 0; i < node->output_size; 
i++1
)
3825
1
            max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 
00
;
3826
1
          ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size), max_outputs, ccv_min(node->input_size, node->output_size));
3827
1
          ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3828
1
          ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3829
1
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3830
1
          int exec_cvt;
3831
1
          ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3832
1
        }
3833
39
        for (i = 0; i < node->graph_ref_size; 
i++28
)
3834
28
        {
3835
28
          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3836
28
          if (graph_ref < 0)
3837
0
            continue;
3838
28
          ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3839
28
          const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3840
28
          ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3841
28
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3842
28
          _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3843
28
        }
3844
32.2k
      } else {
3845
32.2k
        graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3846
32.2k
      }
3847
32.2k
      ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3848
32.2k
    }
3849
32.2k
  } ccv_nnc_graph_visit_endfor
3850
  // Then connect them.
3851
32.2k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3852
32.2k
    if (node->outgoings)
3853
53.4k
      
for (i = 0; 25.3k
i < node->outgoings->rnum;
i++28.1k
)
3854
28.1k
      {
3855
28.1k
        const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
3856
28.1k
        if (graph_execs[outgoing].graph)
3857
27.7k
          ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3858
28.1k
      }
3859
32.2k
  } ccv_nnc_graph_visit_endfor
3860
6.26k
  int source_exec_created = 0;
3861
6.26k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3862
6.26k
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3863
6.26k
  ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3864
  // After the graph is materialized, we need to handle the case that some of these tensors require to be initialized to zero before use.
3865
97.8k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++91.6k
)
3866
91.6k
  {
3867
91.6k
    if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags))
3868
138
    {
3869
138
      int ref = i;
3870
138
      while (tensor_symbol_info[ref].alias_ref)
3871
0
        ref = tensor_symbol_info[ref].alias_ref - 1;
3872
138
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref42
)
3873
0
        ref = tensor_blocks[ref].ref - 1;
3874
      // This is not computable. It could be that we marked a const tensor as init zero.
3875
138
      if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]))
3876
42
        continue;
3877
      // If this tensor is not used by any exec, we don't need to init at all. Skip.
3878
96
      if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3879
0
        continue;
3880
96
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3881
      // Now, we have the original tensor, we can get the actual tensor, and construct the set command.
3882
96
      ccv_nnc_graph_exec_t set_exec;
3883
96
      if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3884
27
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3885
69
      else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3886
69
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3887
192
      for (j = 0; j < tensor_blocks[ref].head->rnum; 
j++96
)
3888
96
      {
3889
96
        const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j);
3890
96
        if (outgoing >= exec_symbol_info_size)
3891
0
          continue;
3892
96
        assert(outgoing >= 0);
3893
96
        assert(graph_execs[outgoing].graph);
3894
96
        ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3895
96
      }
3896
96
      int flags = 0;
3897
96
      if (alloc_dep[ref])
3898
50
        
for (j = 0; 25
j < alloc_dep[ref]->rnum;
j++25
)
3899
25
        {
3900
25
          const int d = *(int*)ccv_array_get(alloc_dep[ref], j);
3901
          // This is from alloc_dep, it should be computable.
3902
25
          assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
3903
25
          if (tensor_blocks[d].tail)
3904
50
            
for (k = 0; 25
k < tensor_blocks[d].tail->rnum;
k++25
)
3905
25
            {
3906
25
              const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
3907
25
              if (incoming >= exec_symbol_info_size)
3908
0
                continue;
3909
25
              assert(incoming >= 0);
3910
25
              assert(graph_execs[incoming].graph);
3911
25
              ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3912
25
              flags = 1;
3913
25
            }
3914
25
        }
3915
      // If cannot find a start node for this exec, we need to append it to the no-op of the start.
3916
96
      if (!flags)
3917
71
      {
3918
71
        if (!source_exec_created)
3919
48
        {
3920
48
          graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3921
48
          source_exec_created = 1;
3922
48
        }
3923
71
        ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3924
71
      }
3925
96
    }
3926
91.6k
  }
3927
  // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3928
  // (we need that if it is not associated as inputs / outputs of any execs, this is possible if all execs associate
3929
  // with its alias).
3930
6.26k
  assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size);
3931
97.8k
  
for (i = 0; 6.26k
i < tensor_arena->vt_tensor_size;
i++91.6k
)
3932
91.6k
  {
3933
91.6k
    ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3934
    // If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3935
91.6k
    if (mv && 
CCV_IS_TENSOR_MULTIVIEW83.1k
(mv))
3936
53
    {
3937
53
      const ccv_array_t* const head = tensor_blocks[i].head;
3938
53
      if (head && 
head->rnum > 047
)
3939
94
        
for (j = 0; 47
j < head->rnum;
j++47
)
3940
47
        {
3941
47
          const int idx = *(int*)ccv_array_get(head, j);
3942
47
          if (idx >= exec_symbol_info_size)
3943
1
            continue;
3944
47
          assert
(idx >= 0)46
;
3945
46
          const int d = graph_execs[idx].d;
3946
46
          ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d);
3947
46
          int flag = 0;
3948
46
          if (exec_info->tensor_wraps_ref)
3949
32
          {
3950
32
            ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1);
3951
113
            for (k = 0; k < tensor_wrap_array->size && 
!flag88
;
k++81
)
3952
81
              flag = (tensor_wrap_array->tensor_wraps[k] && 
tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv55
);
3953
32
          }
3954
          // If none is in the flag, it need to be included in the cast.
3955
46
          if (!flag)
3956
19
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3957
46
        }
3958
53
    }
3959
91.6k
  }
3960
  // Create source / destination phony node. This is to facilitate use of compiled graph.
3961
  // Also, this is needed if you have init zero execs.
3962
6.26k
  if (source_exec_created || 
source_size > 16.21k
)
3963
140
  {
3964
140
    if (!source_exec_created)
3965
92
      graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3966
577
    for (i = 0; i < source_size; 
i++437
)
3967
437
      ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3968
6.12k
  } else {
3969
6.12k
    assert(!source_exec_created);
3970
6.12k
    assert(source_size == 1);
3971
6.12k
    graph_exec_arena->source = graph_execs[sources[0].d];
3972
6.12k
  }
3973
6.26k
  if (destination_size == 1)
3974
6.17k
    graph_exec_arena->destination = graph_execs[destinations[0].d];
3975
89
  else {
3976
89
    graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3977
1.09k
    for (i = 0; i < destination_size; 
i++1.00k
)
3978
1.00k
      ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3979
89
  }
3980
6.26k
  ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3981
6.26k
  ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3982
6.26k
  return graph_exec_arena;
3983
6.26k
}
3984
3985
static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3986
11
{
3987
11
  if (graph_prep->symbolic_graph == pair)
3988
4
    return graph_prep->graph;
3989
7
  int i;
3990
10
  for (i = 0; i < graph_prep->sub_prep_size; 
i++3
)
3991
7
    if (graph_prep->sub_preps[i])
3992
7
    {
3993
7
      ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3994
7
      if (graph)
3995
4
        return graph;
3996
7
    }
3997
3
  return 0;
3998
7
}
3999
4000
// For each direct sub-prep of graph_prep whose symbolic graph has a pair, resolve that pair to a
// concrete graph by searching from root_prep, and store it as the pair of the sub-prep's concrete graph.
// Only the immediate children of graph_prep are visited here.
static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
{
  int k;
  for (k = 0; k < graph_prep->sub_prep_size; k++)
  {
    ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[k];
    // Skip empty slots and sub-graphs that are not paired with anything.
    if (!sub_prep || !sub_prep->symbolic_graph->pair)
      continue;
    sub_prep->graph->pair = _ccv_nnc_graph_find_pair(root_prep, sub_prep->symbolic_graph->pair);
  }
}
4010
4011
// Pair up concrete execs according to the pair_ref recorded on their exec symbols, then do the same
// for every sub-prep / sub-arena.
// root_arena: arena used to look up the concrete exec of the paired symbol.
// graph_prep / graph_exec_arena: the prep and the exec arena of the graph being processed; they must
// describe the same symbolic graph (asserted).
static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
{
  assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
  // The paired symbol is looked up in the pair graph if this symbolic graph has one, otherwise in this graph itself.
  const ccv_nnc_symbolic_graph_t* const pair_graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph;
  int k;
  for (k = 0; k < graph_prep->exec_symbol_info_size; k++)
  {
    const ccv_nnc_graph_exec_symbol_info_t* const info = graph_prep->exec_symbol_info + k;
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(info->flags))
      continue;
    // Nothing to do unless this symbol has a concrete exec and names a pair (pair_ref starts with 1).
    if (!graph_exec_arena->graph_execs[k].graph || !info->pair_ref)
      continue;
    const ccv_nnc_graph_exec_symbol_t pair_symbol = {
      .d = info->pair_ref - 1,
      .graph = pair_graph,
    };
    ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, pair_symbol);
    // A negative index means no concrete exec was found for the paired symbol.
    if (pair_exec.d >= 0)
      ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[k], pair_exec);
  }
  for (k = 0; k < graph_prep->sub_prep_size; k++)
  {
    if (!graph_prep->sub_preps[k])
      continue;
    _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[k], graph_exec_arena->sub_arenas[k]);
  }
}
4033
4034
// Free the duplicated breakpoint exec symbols recorded on graph_prep (if any), then recurse into every
// sub-prep referenced by an exec symbol on the visit order.
// Freeing the duplicates changes the symbolic graph's exec symbols, so the prep's exec_symbol_info copy,
// its visit object and the inferred symbol info are rebuilt before the recursion.
static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
{
  ccv_array_t* const dup_breakpoints = graph_prep->dup_breakpoints;
  if (dup_breakpoints)
  {
    // Strip the const modifier only possible because it is a sub-graph.
    ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
    int k = 0;
    while (k < dup_breakpoints->rnum)
    {
      const ccv_nnc_graph_exec_symbol_t breakpoint = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
      ccv_nnc_graph_exec_symbol_free(symbolic_graph, breakpoint);
      k++;
    }
    ccv_array_free(dup_breakpoints);
    graph_prep->dup_breakpoints = 0;
    // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
    graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
    memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
    // Since exec_symbol_info changed, create a new visit object.
    assert(symbolic_graph->sources);
    assert(symbolic_graph->destinations);
    const int source_size = symbolic_graph->sources->rnum;
    const int destination_size = symbolic_graph->destinations->rnum;
    ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0);
    ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0);
    ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
    // Swap in the new visit only after it has been built; the old one is released first.
    ccv_nnc_graph_visit_free(graph_prep->visit);
    graph_prep->visit = visit;
    // Symbol inference below reads the parent prep's tensor symbol info.
    assert(graph_prep->p);
    ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
  }
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
    int j;
    for (j = 0; j < node->graph_ref_size; j++)
    {
      // Stored graph references start with 1; anything below that is not a sub-graph.
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[j] - 1;
      if (graph_ref < 0)
        continue;
      _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
    }
  } ccv_nnc_graph_visit_endfor
}
4070
4071
// Default compile parameters. The empty initializer leaves every member zero-initialized.
const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4072
4073
// Compile a symbolic graph into a concrete graph plus the tensor arena and exec arena that map
// symbols to concrete tensors / execs.
// tensor_binds / tensor_bind_size: tensors bound to symbols up front; each must be non-null and must
// not be a multiview tensor (asserted).
// outputs, sources, destinations (with their sizes): forwarded to the preparation step.
// graph_ref, tensor_arena_ref, graph_exec_arena_ref: required out-parameters receiving the results.
void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
{
  assert(graph_ref);
  assert(tensor_arena_ref);
  assert(graph_exec_arena_ref);
  // Cannot bind the multi-view.
  int i;
  for (i = 0; i < tensor_bind_size; i++)
  {
    assert(tensor_binds[i].tensor);
    assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor));
  }
  ccv_nnc_symbolic_graph_prep_t* const root_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
  _ccv_nnc_symbolic_graph_prep_while_count_tensor(root_prep);
  // First the tensors: build the arena and fix up its cross references.
  ccv_nnc_tensor_arena_t* const arena = _ccv_nnc_tensor_arena_new(root_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
  _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(arena, root_prep, arena);
  // The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
  _ccv_nnc_graph_fixup_pair(root_prep, root_prep);
  // Now tensor allocation is done, if there are any dup_breakpoints, I need to clean it up.
  _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(root_prep);
  ccv_nnc_graph_t* const graph = root_prep->graph;
  ccv_nnc_graph_exec_arena_t* const exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, root_prep, arena);
  _ccv_nnc_graph_exec_arena_topsort(graph, exec_arena);
  _ccv_nnc_graph_exec_arena_fixup_pair_ref(exec_arena, root_prep, exec_arena);
  // The prep is only scaffolding for compilation; release it before handing the results back.
  _ccv_nnc_symbolic_graph_prep_free(root_prep);
  *tensor_arena_ref = arena;
  *graph_ref = graph;
  *graph_exec_arena_ref = exec_arena;
}
4101
4102
// Release a tensor arena and, recursively, its sub-arenas: multiview tensors, the metadata arrays,
// the lazily created binding tables and finally the arena struct itself. Buffers are not touched here.
static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
{
  // Buffers are inherited from above, no need to dealloc.
  int k;
  for (k = 0; k < tensor_arena->sub_arena_size; k++)
  {
    ccv_nnc_tensor_arena_t* const sub_arena = tensor_arena->sub_arenas[k];
    if (sub_arena)
      _ccv_nnc_tensor_arena_free(sub_arena);
  }
  // m_tensor_idx holds the metadata positions of the multiview tensors; free each before the metadata goes away.
  ccv_array_t* const multiview_positions = tensor_arena->m_tensor_idx;
  for (k = 0; k < multiview_positions->rnum; k++)
  {
    const int pos = *(int*)ccv_array_get(multiview_positions, k);
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
    assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv));
    ccv_nnc_tensor_multiview_free(*mv);
  }
  ccv_array_free(tensor_arena->tensor_metadata);
  ccv_array_free(multiview_positions);
  // The remaining tables only exist if the binding / reinit APIs were used.
  if (tensor_arena->pb_vt_tensors)
    ccfree(tensor_arena->pb_vt_tensors);
  if (tensor_arena->vt_alias_r_refs_p)
    ccfree(tensor_arena->vt_alias_r_refs_p);
  if (tensor_arena->vt_sizes)
    ccfree(tensor_arena->vt_sizes);
  ccfree(tensor_arena);
}
4125
4126
// Point the arena tensor for `symbol` at the data of `tensor`. If the symbol is an alias, the tensor it
// aliases is rebound instead. Every arena tensor that aliases the rebound tensor is re-pointed as well
// and takes over the datatype / reserved fields of `tensor`.
// The symbol must belong to the arena's graph and be within range, and `tensor` must be at least as
// large (in element count) as the arena tensor it replaces (all asserted).
void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
{
  assert(tensor_arena->graph_ref == (intptr_t)symbol.graph);
  assert(symbol.d < tensor_arena->vt_tensor_size);
  assert(symbol.d >= 0);
  int i;
  // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
  if (!tensor_arena->pb_vt_tensors)
  {
    // Snapshot the data pointers before the first binding so they can be restored later.
    ccv_numeric_data_t* const saved = (ccv_numeric_data_t*)cccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
    tensor_arena->pb_vt_tensors = saved;
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
    {
      if (!tensor_arena->vt_tensors[i])
        continue;
      saved[i] = tensor_arena->vt_tensors[i]->data;
    }
  }
  if (!tensor_arena->vt_alias_r_refs_p)
  {
    // One allocation, two halves: per-tensor start positions, followed by the alias indices
    // grouped by the tensor they alias.
    int* const starts = (int*)cccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
    int* const members = starts + tensor_arena->vt_tensor_size;
    tensor_arena->vt_alias_r_refs_p = starts;
    tensor_arena->vt_alias_r_refs = members;
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
    {
      if (!tensor_arena->vt_alias_refs[i])
        continue;
      const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
      assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
      ++starts[alias_ref]; // Count how many alias there are.
    }
    // Turn the counts into running totals, so each entry points to the end of its group.
    int refp = 0;
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
    {
      if (starts[i])
      {
        refp += starts[i];
        starts[i] = refp;
      } else {
        starts[i] = -1; // This has no refs.
      }
    }
    for (i = refp; i < tensor_arena->vt_tensor_size; i++)
      members[i] = -1; // These are not allocated.
    // Fill each group back to front; afterwards every start entry points to the beginning of its group.
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
    {
      if (!tensor_arena->vt_alias_refs[i])
        continue;
      const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
      assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
      const int pos = --starts[alias_ref];
      assert(pos >= 0);
      members[pos] = i;
    }
  }
  // If the symbol is an alias, bind the tensor it aliases (alias refs start with 1).
  const int aliased = tensor_arena->vt_alias_refs[symbol.d];
  const int symbol_d = aliased ? aliased - 1 : symbol.d;
  if (CCV_IS_TENSOR_VIEW(tensor))
  {
    assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
    assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&
          ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
        (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
  } else
    { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
  if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
    { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
  tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
  // Nothing aliases the rebound tensor: done.
  if (tensor_arena->vt_alias_r_refs_p[symbol_d] < 0)
    return;
  i = tensor_arena->vt_alias_r_refs_p[symbol_d];
  while (i < tensor_arena->vt_tensor_size)
  {
    const int d = tensor_arena->vt_alias_r_refs[i];
    if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
      break;
    ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
    d_tensor->info.datatype = tensor->info.datatype;
    d_tensor->info.reserved = tensor->info.reserved;
    if (!CCV_IS_TENSOR_VIEW(d_tensor))
    {
      d_tensor->data.u8 = tensor->data.u8;
      d_tensor->dataof = tensor->dataof;
    } else {
      // A view keeps its own offset into the newly bound data.
      ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
    }
    i++;
  }
}
4198
4199
// Restore the data of every arena tensor from the snapshot in pb_vt_tensors.
// Does nothing when no snapshot exists.
void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
{
  const ccv_numeric_data_t* const saved = tensor_arena->pb_vt_tensors;
  if (!saved)
    return;
  int k;
  for (k = 0; k < tensor_arena->vt_tensor_size; k++)
  {
    ccv_nnc_tensor_t* const vt_tensor = tensor_arena->vt_tensors[k];
    if (vt_tensor)
      vt_tensor->data = saved[k];
  }
}
4208
4209
uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4210
2
{
4211
2
  uint64_t total_size = 0;
4212
2
  int i;
4213
36
  for (i = 0; i < tensor_arena->buffer_size; 
i++34
)
4214
34
    total_size += tensor_arena->buffers[i].size;
4215
2
  return total_size;
4216
2
}
4217
4218
// Overwrite the tensor parameters of everything reachable from a multiview: its `it` tensor (if set)
// and each of its repeat + kind data entries, recursing into nested multiviews.
static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
{
  if (mv->it)
    mv->it->info = params;
  int k;
  for (k = 0; k < mv->repeat + mv->kind; k++)
  {
    ccv_nnc_tensor_t* const entry = CCV_NNC_MULTIVIEW_DATA(mv)[k];
    if (!CCV_IS_TENSOR_MULTIVIEW(entry))
    {
      entry->info = params;
      continue;
    }
    _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)entry, params);
  }
}
4232
4233
// Re-apply the tensor parameters recorded in `graph` to the tensors of an existing arena without
// reallocating. The datatype and reserved fields of each arena tensor are kept; the other parameters
// come from the symbol. Alias tensors have their data pointer (and, for views, offset and stride)
// recomputed from the tensor they alias.
// Returns 0 on success, or -1 (before changing any tensor) when some non-alias tensor would need more
// bytes than it had when this function was first called on the arena.
// Graphs with sub-graphs are not supported (asserted at the end).
int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
{
  int i;
  assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
  if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
  {
    tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
    {
      // Only non-alias tensors get a recorded size; other entries are left unset and never read.
      if (!tensor_arena->vt_tensors[i] || tensor_arena->vt_alias_refs[i])
        continue;
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
      // For a multiview, descend (through `it` when set, else the first data entry) to a plain tensor.
      while (CCV_IS_TENSOR_MULTIVIEW(tensor))
      {
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
        tensor = mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0];
      }
      tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
    }
  }
  // Validate first: bail out before mutating anything if any tensor would outgrow its original size.
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
  {
    if (!tensor_arena->vt_tensors[i] || tensor_arena->vt_alias_refs[i])
      continue;
    ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
    ccv_nnc_tensor_param_t params = symbol_info->info;
    params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
    params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
    if (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params))
      return -1;
  }
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
  {
    ccv_nnc_tensor_t* const tensor = tensor_arena->vt_tensors[i];
    if (!tensor)
      continue;
    ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
    if (CCV_IS_TENSOR_MULTIVIEW(tensor))
    {
      assert(!tensor_arena->vt_alias_refs[i]);
      _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
      continue;
    }
    // Take the parameters from the symbol, but keep the tensor's own datatype / reserved fields.
    ccv_nnc_tensor_param_t params = symbol_info->info;
    params.datatype = tensor->info.datatype;
    params.reserved = tensor->info.reserved;
    if (!tensor_arena->vt_alias_refs[i])
    {
      tensor->info = params;
      continue;
    }
    // Alias: recompute where it points inside the tensor it aliases.
    const off_t off = ccv_nnc_tensor_view_offset(params.datatype, symbol_info->stride, symbol_info->ofs);
    tensor->info = params;
    const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
    ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
    if (CCV_IS_TENSOR_VIEW(tensor))
    {
      ccv_nnc_tensor_view_t* const view = (ccv_nnc_tensor_view_t*)tensor;
      view->off = off;
      memcpy(view->stride, symbol_info->stride, sizeof(view->stride));
    }
  }
  // Should handle sub_tensor_arena, don't do that at the moment.
  assert(!graph->sub_graphs);
  return 0;
}
4299
4300
// Re-apply the commands recorded on the exec symbols of `symbolic_graph` to the concrete execs in
// `graph`. Arena entries with a negative exec index are skipped.
void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
{
  assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
  int k;
  for (k = 0; k < graph_exec_arena->graph_exec_size; k++)
  {
    const ccv_nnc_graph_exec_t exec = graph_exec_arena->graph_execs[k];
    if (exec.d < 0)
      continue;
    const ccv_nnc_cmd_t current = ccv_nnc_graph_exec_cmd(graph, exec);
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, k);
    ccv_nnc_cmd_t updated = symbol_info->cmd;
    // If the command matches, replacing the backend and algorithm to the existing one, which hypothetically has been autotuned..
    if (updated.cmd == current.cmd)
    {
      updated.backend = current.backend;
      updated.algorithm = current.algorithm;
    }
    ccv_nnc_graph_exec_set(graph, exec, updated);
  }
}
4320
4321
// Free every buffer still held by the tensor arena and clear its pointer, then run and drop the
// registered disposers. The arena struct itself stays valid.
// Which deallocator is used depends on the build (CUDA / MPS / CPU only) and on the memory type
// encoded in the buffer's type.
void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
{
  int i;
  for (i = 0; i < tensor_arena->buffer_size; i++)
  {
    // Buffers with a null pointer have nothing to release.
    if (!tensor_arena->buffers[i].ptr)
      continue;
    const int buffer_type = tensor_arena->buffers[i].type;
    const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
#ifdef HAVE_CUDA
    if (memory_type != CCV_TENSOR_GPU_MEMORY)
    {
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
      // Pinned host memory goes back through the CUDA host deallocator.
      if (tensor_arena->buffers[i].pin_mem)
        cuhostfree(tensor_arena->buffers[i].ptr);
      else
        ccfree(tensor_arena->buffers[i].ptr);
    } else {
      const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
      // A custom allocator with a free hook takes precedence over the default device free.
      if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
        tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
      else
        cufree(device_id, tensor_arena->buffers[i].ptr);
    }
#elif defined(HAVE_MPS)
    if (memory_type != CCV_TENSOR_GPU_MEMORY)
    {
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
      ccfree(tensor_arena->buffers[i].ptr);
    } else {
      const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
      // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
      //  tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
      // else
      mpheapfree(device_id, tensor_arena->buffers[i].ptr);
    }
#else
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
    ccfree(tensor_arena->buffers[i].ptr);
#endif
    tensor_arena->buffers[i].ptr = 0;
  }
  // For now, the life-cycle of the disposers lives with the buffer. It may ends before the tensor arena deallocates.
  if (!tensor_arena->disposers)
    return;
  for (i = 0; i < tensor_arena->disposers->rnum; i++)
  {
    ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i);
    disposer->dispose(disposer->ptr, disposer->userdata);
  }
  ccv_array_free(tensor_arena->disposers);
  tensor_arena->disposers = 0;
}
4375
4376
// Fully dispose of a tensor arena: release its buffers (and run its disposers) first, then free the
// arena's own bookkeeping, sub-arenas included.
void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
{
  // Buffers first: the bookkeeping freed next is what records them.
  ccv_nnc_tensor_arena_buffer_free(tensor_arena);
  _ccv_nnc_tensor_arena_free(tensor_arena);
}
4381
4382
// Free a graph exec arena together with all of its (non-null) sub-arenas, children first.
void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
{
  int k;
  for (k = 0; k < graph_exec_arena->sub_arena_size; k++)
  {
    ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[k];
    if (!sub_arena)
      continue;
    ccv_nnc_graph_exec_arena_free(sub_arena);
  }
  ccfree(graph_exec_arena);
}