Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_compile.c
Line | Count | Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#include "ccv_nnc_easy.h"
4
#include "ccv_internal.h"
5
#ifdef HAVE_CUDA
6
#include "gpu/ccv_nnc_compat.h"
7
#elif defined(HAVE_MPS)
8
#include "mps/ccv_nnc_mps.h"
9
#endif
10
#include "_ccv_nnc_graph.h"
11
#include "_ccv_nnc_symbolic_graph.h"
12
13
// MARK - Level-3 API
14
15
typedef struct {
16
  int flags;
17
  int type;
18
  int pin_mem; // This memory needs to be pinned.
19
  int ref; // Reference to another tensor block. Start with 1.
20
  int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21
  int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22
  int companion_ref; // Reference to another block such that the two share the same memory region. Start with 1. The current crude implementation requires the two to mutually be companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. For the allocation algorithm, we use the primary throughout.
23
  int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24
  ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25
  uint64_t size; // The size of the tensor expected.
26
  int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27
  ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28
  ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29
  ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30
} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
32
4.34M
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
34
enum {
35
  UNASSIGNED = 0x1,
36
  ALIAS = 0x2,
37
  READ_ONLY = 0x4,
38
  WRITE_ONLY = 0x8,
39
  READ_WRITE = 0xc,
40
  ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
41
  UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42
  UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43
};
44
45
#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
46
#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
47
5.57M
#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
48
6.40k
#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
49
3
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
50
9.24M
#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
51
8.59M
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
52
27.7k
#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
53
6.48k
#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
54
95
#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55
#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
56
180
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57
19.7k
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
58
116
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59
13.3k
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
60
61
118k
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
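A minimal usage sketch (not part of the original file; the function and block names are hypothetical) of how the flag macros above are meant to compose:

static void tensor_block_flag_example(void)
{
  ccv_nnc_tensor_block_t block = {0}; // flags == 0, i.e. ordinary, with no R/W class set.
  TENSOR_SET_READ_WRITE(block, READ_ONLY); // The 0xc nibble carries the R/W class.
  TENSOR_EXPECT_SET_UNASSIGNED(block); // The low 2 bits mark it as not needing an allocation.
  assert(TENSOR_EXPECT_UNASSIGNED(block));
  assert(!TENSOR_EXPECT_COMPUTABLE(block)); // Computable means neither ALIAS nor UNASSIGNED.
  assert(TENSOR_READ_WRITE(block) == READ_ONLY); // The R/W bits are untouched by the low 2 bits.
}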
62
63
// Holds additional information about the exe nodes.
64
typedef struct {
65
  int flags;
66
} ccv_nnc_graph_exec_flag_t;
67
68
enum {
69
  CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
70
};
71
72
typedef struct {
73
  int index;
74
  int oc;
75
  int type;
76
  uint64_t size;
77
} ccv_nnc_tensor_opt_t;
78
79
// We first sort the same type together (because they won't be reused at all).
80
// Then we sort by size, and after that, by oc.
81
226k
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82
226k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
83
#undef more_than
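As a quick illustration (not from the original source) of the ordering induced by more_than, assuming CCV_IMPLEMENT_QSORT orders elements so that the comparator holds between earlier and later entries:

static void tensor_opt_sort_example(void)
{
  ccv_nnc_tensor_opt_t opts[3] = {
    { .index = 0, .oc = 1, .type = 1, .size = 16 },
    { .index = 1, .oc = 3, .type = 1, .size = 64 },
    { .index = 2, .oc = 5, .type = 1, .size = 64 },
  };
  // Largest size first; ties broken by larger oc. Expected index order afterwards: 2, 1, 0.
  _ccv_nnc_tensor_opt_sort_by_size_and_oc(opts, 3, 0);
}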
84
typedef struct {
85
  int idx;
86
  int hop;
87
} ccv_nnc_tensor_hop_t;
88
225k
#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89
225k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
90
#undef less_than
91
92
// If b has items that overlap with a, a is still after b (inclusive).
93
static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94
0
{
95
0
  assert(a);
96
0
  assert(b);
97
0
  int x, y;
98
0
  for (x = 0; x < b->rnum; x++)
99
0
  {
100
0
    const int p = *(int*)ccv_array_get(b, x);
101
0
    int flag = 0;
102
    // In the extreme case where a is a superset of b, a is still after b, so we are good.
103
0
    for (y = 0; !flag && y < a->rnum; y++)
104
0
    {
105
0
      const int q = *(int*)ccv_array_get(a, y);
106
0
      flag = (p == q);
107
0
    }
108
0
    if (!flag)
109
0
      for (y = 0; y < a->rnum; y++)
110
0
      {
111
0
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
112
0
        if (!cell.i32 || cell.i32[0] == 0)
113
0
          return 0;
114
0
      }
115
0
  }
116
  // If b->rnum == 0, a is after b for sure.
117
  // Otherwise, if a->rnum == 0, we don't check anything, but if b->rnum > 0, then we cannot say a is after b.
118
  // If both a->rnum > 0 and b->rnum > 0, the above logic should have checked it all.
119
0
  return (a->rnum > 0 || b->rnum == 0);
120
0
}
121
122
static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123
1.27M
{
124
1.27M
  assert(a);
125
1.27M
  assert(b);
126
1.27M
  int x, y, max_hop = 0;
127
1.34M
  for (x = 0; x < a->rnum; x++)
128
1.34M
    for (y = 0; y < b->rnum; y++)
129
1.27M
    {
130
1.27M
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
131
1.27M
      if (!cell.i32 || cell.i32[0] == 0)
132
1.20M
        return 0;
133
73.5k
      max_hop = ccv_max(cell.i32[0], max_hop);
134
73.5k
    }
135
  // If we've made it through the nested for loop, a is verifiably, deterministically after b now.
136
  // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
137
73.0k
  return max_hop;
138
1.27M
}
139
140
// If every a's head is deterministically after b's tail
141
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
142
1.27M
{
143
1.27M
  return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
144
1.27M
}
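A condensed sketch (not in the original file; the helper name is hypothetical) of the pairwise interference test that the overlap-count loop below performs with these two helpers: two computable blocks may only share memory when one provably dies before the other is born.

static int _tensor_blocks_interfere(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
{
  // No interference only if every head of a is deterministically after b's tail, or vice versa.
  const int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, a, b);
  const int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, b, a);
  return !a_hop_b && !b_hop_a;
}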
145
146
typedef struct {
147
  ccv_array_t** alloc_dep;
148
  int vt_block_size;
149
  int buffer_size;
150
  int block_size;
151
  int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
152
  struct {
153
    int type; // The type from tensor blocks.
154
    int pin_mem; // Whether this is pinned memory.
155
    int flags; // The flags (currently for READ_ONLY or not).
156
    uint64_t size; // The size of the buffer allocated.
157
    int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
158
    ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
159
  }* buffers;
160
  struct {
161
    int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
162
    int block_ref; // A reference to which block in the given tensor_block to use.
163
    uint64_t offset; // The offset of this block.
164
  }* blocks;
165
} ccv_nnc_tensor_alloc_prep_t;
166
167
typedef struct ccv_nnc_symbolic_graph_prep_s {
168
  int flags;
169
  int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
170
  int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
171
  int exec_idx;
172
  int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
173
  int tensor_symbol_info_size;
174
  int exec_symbol_info_size;
175
  int tensor_block_size;
176
  int sub_prep_size;
177
  ccv_nnc_tensor_block_t* tensor_blocks;
178
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
179
  ccv_nnc_graph_exec_flag_t* exec_flags;
180
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
181
  int* dup_tensor_block_ref;
182
  ccv_nnc_graph_visit_t* visit;
183
  ccv_nnc_tensor_alloc_prep_t* alloc_prep;
184
  struct ccv_nnc_symbolic_graph_prep_s* p;
185
  struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
186
  // Structures that don't require to be freed after deallocation.
187
  const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
188
  ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
189
  ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
190
  ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
191
} ccv_nnc_symbolic_graph_prep_t;
192
193
typedef struct {
194
  int oc;
195
  ccv_array_t* itf;
196
} ccv_nnc_tensor_block_adjacent_t;
197
198
static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
199
6.22k
{
200
  // Compute how many dis-continuous buffers are needed.
201
  // We prefer to have several dis-continuous buffers instead of one big buffer because
202
  // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
203
  // to fully utilize memory.
204
6.22k
  int i, j, k;
205
6.22k
  ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
206
6.22k
  int allocable_tensor_size = 0, available_tensor_size = 0;
207
97.6k
  for (i = 0; i < tensor_block_size; i++)
208
91.4k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
209
30.1k
    {
210
      // Tensors for which we need the header info.
211
30.1k
      ++available_tensor_size;
212
30.1k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
213
        // Tensors that we actually need to allocate (exclude the alias).
214
27.4k
        ++allocable_tensor_size;
215
30.1k
    }
216
6.22k
  ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
217
6.22k
  ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
218
6.22k
  ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
219
  // Overlap count.
220
97.6k
  for (i = 0; i < tensor_block_size; i++)
221
91.4k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
222
1.68M
      for (j = i + 1; j < tensor_block_size; j++)
223
1.66M
        if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
224
637k
        {
225
          // Check to see if they interfere (default to yes).
226
          // If any of i's heads is deterministically later than j's tail,
227
          // or any of i's tails is deterministically earlier than j's head, they don't interfere.
228
637k
          const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
229
637k
          if (i_hop_j > 0)
230
293
          {
231
293
            ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
232
293
            ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
233
293
          }
234
637k
          const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
235
637k
          if (j_hop_i > 0)
236
72.7k
          {
237
72.7k
            ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
238
72.7k
            ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
239
72.7k
          }
240
          // It cannot be that both i can hop to j and j can hop to i.
241
637k
          assert(!(i_hop_j > 0 && j_hop_i > 0));
242
637k
          if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
243
132k
          {
244
132k
            if (!adj[i].itf)
245
4.60k
              adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
246
132k
            ccv_array_push(adj[i].itf, &j);
247
132k
            ++adj[i].oc;
248
132k
            if (!adj[j].itf)
249
22.4k
              adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
250
132k
            ccv_array_push(adj[j].itf, &i);
251
132k
            ++adj[j].oc;
252
132k
          }
253
637k
        }
254
6.22k
  const int exec_dep_rows = exec_dep->rows;
255
6.22k
  ccv_matrix_free(exec_dep);
256
6.22k
  ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
257
6.22k
  int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
258
6.22k
  uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
259
6.22k
  uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
260
6.22k
  uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
261
6.22k
  int num_assigned = 0; 
262
  // I could do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
263
  // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
264
  // The first channel denotes the bytes available for allocation,
265
  // the second channel denotes the offset available for the allocation,
266
6.22k
  ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
267
6.22k
  ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
268
33.6k
  for (j = 0; j < allocable_tensor_size;)
269
27.3k
  {
270
    // Find the one with the largest overlap (if the overlap is the same, the larger size) that is not yet assigned.
271
27.3k
    uint64_t max_size = 0;
272
27.3k
    ccv_array_clear(opt);
273
27.3k
    int current_type = 0; // Deal with one type at a time.
274
4.00M
    for (i = 0; i < tensor_block_size; i++)
275
3.97M
      if (tensor_blocks[i].size >= max_size &&
276
3.97M
        TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
277
3.97M
        IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
278
3.97M
        (!current_type || tensor_blocks[i].type == current_type))
279
122k
      {
280
122k
        ccv_nnc_tensor_opt_t a = {
281
122k
          .size = tensor_blocks[i].size,
282
122k
          .index = i,
283
122k
          .oc = adj[i].oc,
284
122k
          .type = tensor_blocks[i].type,
285
122k
        };
286
122k
        assert(a.type);
287
122k
        current_type = a.type; // Now we know the primary type we should deal with.
288
122k
        if (tensor_blocks[i].companion_ref)
289
36
        {
290
36
          const int companion_ref = tensor_blocks[i].companion_ref - 1;
291
36
          a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
292
36
          a.oc += adj[companion_ref].oc;
293
36
        }
294
        // In case we have a tie, take them all in the array.
295
122k
        if (a.size > max_size)
296
31.9k
          ccv_array_clear(opt), max_size = a.size;
297
122k
        ccv_array_push(opt, &a);
298
122k
      }
299
27.3k
    assert(opt->rnum > 0);
300
    // Order opt array by the oc because type and size should be equal at this point.
301
27.3k
    _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
302
    // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
303
27.3k
    int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
304
27.3k
    uint64_t min_val[2] = {
305
27.3k
      0, 0
306
27.3k
    };
307
27.3k
    if (j > 0)
308
22.5k
    {
309
69.5k
      for (i = 0; i < opt->rnum; i++)
310
58.0k
      {
311
58.0k
        ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
312
58.0k
        if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
313
28.7k
          continue;
314
        // Now, determine the order between a and c. After this, we can always check whether y
315
        // can hop to the earliest one and if the latest one can hop to x.
316
        // The earliest one will be called p and the latest one will be called q.
317
29.2k
        int p = a.index;
318
29.2k
        int q = a.index;
319
29.2k
        if (tensor_blocks[a.index].companion_ref)
320
16
        {
321
16
          const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
322
16
          if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
323
3
            continue;
324
13
          const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
325
13
          if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
326
1
            p = companion_ref;
327
12
          else {
328
12
            const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
329
12
            if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
330
12
              q = companion_ref;
331
0
            else { // Otherwise, b is in between p and q.
332
0
              const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
333
0
              const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
334
0
              assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
335
0
            }
336
12
          }
337
13
        }
338
29.2k
        assert(tensor_blocks[q].type == tensor_blocks[p].type);
339
29.2k
        const int type = tensor_blocks[p].type;
340
        // y is always earlier than x, but this is hard to assert now.
341
        // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
342
        // Thus, the hop between y and x (through a) should be smallest ones.
343
        // We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
344
        // out of q. For these nodes, we try to verify whether they form a connection (by checking against
345
        // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
346
29.2k
        int y_size = 0;
347
29.2k
        ccv_nnc_tensor_hop_t* const y_buf = buf;
348
96.2k
#define for_block(y, val) do { \
349
96.2k
          if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
350
96.2k
            y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
351
35.7k
              .idx = y + 1, .hop = ((int*)val)[0] \
352
35.7k
            }; \
353
96.2k
        } while(0)
354
29.2k
        ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
355
29.2k
        if (y_vector)
356
96.2k
          CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
357
29.2k
#undef for_block
358
29.2k
        assert(y_size <= tensor_block_size);
359
29.2k
        int x_size = 0;
360
29.2k
        ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
361
76.8k
#define for_block(x, val) do { \
362
76.8k
          if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
363
76.8k
            x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
364
30.8k
              .idx = x + 1, .hop = ((int*)val)[0] \
365
30.8k
            }; \
366
76.8k
        } while(0)
367
29.2k
        ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
368
29.2k
        if (x_vector)
369
76.8k
          CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
370
29.2k
#undef for_block
371
29.2k
        assert(y_size + x_size <= tensor_block_size);
372
29.2k
        int x, y;
373
29.2k
        _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
374
41.7k
        for (y = 0; y < y_size; y++)
375
18.7k
        {
376
18.7k
          const int hop = exec_dep_rows + y_buf[y].hop;
377
18.7k
          if (hop >= min_hop)
378
0
            break;
379
18.7k
          const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
380
18.7k
          if (val.u64 && val.u64[0] >= a.size)
381
6.24k
          {
382
6.24k
            min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
383
6.24k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
384
6.24k
            break;
385
6.24k
          }
386
18.7k
        }
387
29.2k
        _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
388
41.7k
        for (x = 0; x < x_size; x++)
389
15.5k
        {
390
15.5k
          const int hop = exec_dep_rows + x_buf[x].hop;
391
15.5k
          if (hop >= min_hop)
392
260
            break;
393
15.3k
          const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
394
15.3k
          if (val.u64 && val.u64[0] >= a.size)
395
2.81k
          {
396
2.81k
            min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
397
2.81k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
398
2.81k
            break;
399
2.81k
          }
400
15.3k
        }
401
29.2k
        const int x_min_hop = x_buf[0].hop;
402
57.3k
        for (y = 0; y < y_size; y++)
403
29.3k
        {
404
29.3k
          const int y_hop_p_v = y_buf[y].hop;
405
29.3k
          if (y_hop_p_v + x_min_hop >= min_hop)
406
1.19k
            break;
407
28.1k
          ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
408
28.1k
          if (y_vector)
409
28.1k
          {
410
74.6k
            for (x = 0; x < x_size; x++)
411
49.4k
            {
412
49.4k
              const int q_hop_x_v = x_buf[x].hop;
413
49.4k
              const int hop = y_hop_p_v + q_hop_x_v;
414
49.4k
              if (hop >= min_hop)
415
420
                break;
416
49.0k
              const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
417
49.0k
              if (val.u64 && val.u64[0] >= a.size)
418
2.50k
              {
419
2.50k
                min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
420
2.50k
                  min_val[0] = val.u64[0], min_val[1] = val.u64[1];
421
2.50k
                break;
422
2.50k
              }
423
49.0k
            }
424
28.1k
          }
425
28.1k
        }
426
        // If I found a place, stop, and exit.
427
29.2k
        if (min_y > 0 || min_x < tensor_block_size + 1)
428
11.1k
        {
429
11.1k
          min_i = i;
430
11.1k
          break;
431
11.1k
        }
432
        // There is no space to insert this block, mark it as such.
433
18.1k
        tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
434
18.1k
        if (tensor_blocks[a.index].companion_ref)
435
13
        {
436
13
          const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
437
13
          tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
438
13
        }
439
18.1k
      }
440
22.5k
    }
441
    // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
442
    // and default to largest size available.
443
27.3k
    ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
444
27.3k
    if (min_i == -1)
445
16.2k
    {
446
16.2k
      allocated_size[num_assigned] = a.size;
447
16.2k
      ++num_assigned;
448
16.2k
    }
449
27.3k
    int assign_group = num_assigned;
450
27.3k
    if (min_y > 0)
451
8.55k
    {
452
8.55k
      assign_group = assigned[min_y - 1];
453
      // The y and x should belong to the same assigned group.
454
8.55k
      assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
455
18.8k
    } else if (min_x < tensor_block_size + 1)
456
2.57k
      assign_group = assigned[min_x - 1];
457
    // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
458
27.3k
    if (min_y != 0 || min_x != tensor_block_size + 1)
459
11.1k
    {
460
11.1k
      uint64_t val[2] = {
461
11.1k
        min_val[0], min_val[1]
462
11.1k
      };
463
11.1k
      assert(val[0] >= a.size);
464
11.1k
      val[0] -= a.size;
465
11.1k
      val[1] = val[1] + a.size; // Move the offset to the next one.
466
11.1k
      ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
467
11.1k
    }
468
27.3k
    int strings[3];
469
27.3k
    strings[0] = a.index + 1;
470
27.3k
    int string_size = 1;
471
    // Assign out the designated companion if it exists.
472
27.3k
    if (tensor_blocks[a.index].companion_ref)
473
20
    {
474
20
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
475
20
      assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
476
20
      const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
477
20
      if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
478
2
      {
479
4
        for (i = 0; i < string_size; i++)
480
2
          strings[i + 1] = strings[i];
481
2
        strings[0] = companion_ref + 1;
482
18
      } else {
483
18
        const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
484
18
        if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
485
18
          strings[string_size] = companion_ref + 1;
486
0
        else {
487
          // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
488
0
          assert(string_size == 2);
489
0
          strings[2] = strings[1];
490
0
          strings[1] = companion_ref + 1;
491
0
        }
492
18
      }
493
20
      ++string_size;
494
20
    }
495
    // Assign out and update oc.
496
54.7k
    for (i = 0; i < string_size; i++)
497
27.4k
    {
498
27.4k
      const int index = strings[i] - 1;
499
      // Assign out the selected one.
500
27.4k
      assigned[index] = assign_group;
501
      // The offset for this one should be either 0 (started a new group, when min_i == -1) or the offset on this edge.
502
27.4k
      allocated_offset[index] = min_val[1];
503
27.4k
      if (adj[index].itf)
504
292k
        for (k = 0; k < adj[index].itf->rnum; k++)
505
265k
        {
506
265k
          const int d = *(int*)ccv_array_get(adj[index].itf, k);
507
265k
          if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
508
132k
            --adj[d].oc;
509
265k
        }
510
27.4k
    }
511
27.3k
    uint64_t val[2] = {
512
27.3k
      a.size, min_val[1]
513
27.3k
    };
514
27.3k
    uint64_t consumed_size = 0;
515
    // Go over from min_y to string_size (excluding min_x).
516
27.3k
    for (i = 0; i < string_size; i++)
517
27.3k
    {
518
27.3k
      const uint64_t size = tensor_blocks[strings[i] - 1].size;
519
27.3k
      assert(size <= a.size);
520
      // Update consumed_size if "size" is bigger than it.
521
27.3k
      if (size > consumed_size)
522
27.3k
      {
523
27.3k
        val[0] = size - consumed_size;
524
27.3k
        ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
525
27.3k
        consumed_size = size;
526
27.3k
        val[1] = min_val[1] + consumed_size;
527
27.3k
      }
528
      // If it consumed all the flow, break out.
529
27.3k
      if (consumed_size == a.size)
530
27.3k
        break;
531
27.3k
    }
532
54.7k
    for (i = 0; i < string_size; i++)
533
27.4k
    {
534
27.4k
      const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
535
27.4k
      uint64_t val[2] = {
536
27.4k
        i_size, min_val[1]
537
27.4k
      };
538
27.4k
      uint64_t consumed_size = 0;
539
27.4k
      for (k = i + 1; k < string_size; k++)
540
20
      {
541
20
        const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
542
        // Update consumed_size if "size" is bigger than it.
543
20
        if (size > consumed_size)
544
20
        {
545
20
          val[0] = size - consumed_size;
546
20
          ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
547
20
          consumed_size = size;
548
20
          val[1] = min_val[1] + consumed_size;
549
20
        }
550
        // If it consumed all the flow, break out.
551
20
        if (consumed_size == i_size)
552
20
          break;
553
20
      }
554
27.4k
      val[0] = i_size - consumed_size;
555
      // Still have residual, flow it to min_x.
556
27.4k
      if (val[0] > 0)
557
27.3k
        ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
558
27.4k
    }
559
27.3k
    if (min_i == -1)
560
16.2k
    {
561
      // If we decide to insert a new edge, simply mark anyone who does not interfere with it to be redone.
562
16.2k
      const int p = strings[0] - 1;
563
16.2k
      const int q = strings[string_size - 1] - 1;
564
16.2k
      const int type = tensor_blocks[p].type;
565
16.2k
#define for_block(y, val) do { \
566
9.13k
        if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
567
9.13k
        { \
568
4.98k
          tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
569
4.98k
          if (tensor_blocks[y].companion_ref) \
570
4.98k
          { \
571
3
            const int companion_ref = tensor_blocks[y].companion_ref - 1; \
572
3
            tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
573
3
          } \
574
4.98k
        } \
575
9.13k
      } while(0)
576
16.2k
      ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
577
16.2k
      if (y_vector)
578
9.13k
        CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
579
16.2k
#undef for_block
580
30.0k
#define for_block(x, val) do { \
581
30.0k
        if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
582
30.0k
        { \
583
14.2k
          tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
584
14.2k
          if (tensor_blocks[x].companion_ref) \
585
14.2k
          { \
586
2
            const int companion_ref = tensor_blocks[x].companion_ref - 1; \
587
2
            tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
588
2
          } \
589
14.2k
        } \
590
30.0k
      } while(0)
591
16.2k
      ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
592
16.2k
      if (x_vector)
593
30.0k
        CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
594
16.2k
#undef for_block
595
16.2k
    }
596
27.3k
    j += string_size;
597
27.3k
  }
598
6.22k
  ccfree(tensor_block_cannot_insert);
599
6.22k
  ccfree(buf);
600
6.22k
  ccv_array_free(opt);
601
6.22k
  ccv_matrix_free(tensor_df);
602
6.22k
  ccv_matrix_free(tensor_dt);
603
54.7k
#define for_block(y, x, val) do { \
604
54.7k
    if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
605
54.7k
    { \
606
11.3k
      if (!alloc_dep[x - 1]) \
607
11.3k
        alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
608
11.3k
      ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
609
11.3k
    } \
610
54.7k
  } while (0)
611
54.7k
  CCV_SPARSE_FOREACH(alloc, for_block);
612
6.22k
#undef for_block
613
6.22k
  ccv_matrix_free(alloc);
614
97.6k
  for (i = 0; i < tensor_block_size; i++)
615
91.4k
    if (adj[i].itf)
616
27.0k
      ccv_array_free(adj[i].itf);
617
6.22k
  ccfree(adj);
618
6.22k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
619
6.22k
  alloc_prep->alloc_dep = alloc_dep;
620
6.22k
  alloc_prep->vt_block_size = tensor_block_size;
621
6.22k
  alloc_prep->buffer_size = num_assigned;
622
6.22k
  alloc_prep->block_size = available_tensor_size;
623
6.22k
  alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
624
6.22k
  alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
625
6.22k
  alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
626
6.22k
  memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
627
22.4k
  for (i = 0; i < num_assigned; i++)
628
16.2k
    alloc_prep->buffers[i].size = allocated_size[i];
629
6.22k
  if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
630
0
  {
631
0
    size_t total_size = 0;
632
0
    for (i = 0; i < num_assigned; i++)
633
0
      total_size += allocated_size[i];
634
0
    PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size);
635
0
  }
636
6.22k
  ccfree(allocated_size);
637
6.22k
  j = 0;
638
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
639
97.6k
  for (i = 0; i < tensor_block_size; i++)
640
91.4k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
641
30.1k
    {
642
30.1k
      alloc_prep->blocks[j].block_ref = i;
643
30.1k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
644
27.4k
      {
645
27.4k
        alloc_prep->vt_blocks[i] = j;
646
        // Also, set its allocations.
647
27.4k
        assert(assigned[i] > 0);
648
27.4k
        const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
649
27.4k
        alloc_prep->blocks[j].offset = allocated_offset[i];
650
27.4k
        if (!alloc_prep->buffers[buffer_ref].type)
651
16.2k
          alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
652
27.4k
        alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
653
27.4k
        alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
654
27.4k
        assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
655
27.4k
      } else {
656
2.71k
        alloc_prep->vt_blocks[i] = -1;
657
2.71k
        alloc_prep->blocks[j].buffer_ref = -1;
658
2.71k
        alloc_prep->blocks[j].offset = 0;
659
2.71k
      }
660
30.1k
      ++j;
661
30.1k
    } else
662
61.3k
      alloc_prep->vt_blocks[i] = -1;
663
6.22k
  ccfree(allocated_offset);
664
6.22k
  ccfree(assigned);
665
6.22k
  return alloc_prep;
666
6.22k
}
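A minimal read-only sketch (not part of the source, assuming <stdio.h> is available; the function name is hypothetical) of how the returned ccv_nnc_tensor_alloc_prep_t can be walked, using only the fields populated above:

static void print_alloc_prep(const ccv_nnc_tensor_alloc_prep_t* const alloc_prep)
{
  int i;
  for (i = 0; i < alloc_prep->buffer_size; i++)
    printf("buffer %d: %llu bytes\n", i, (unsigned long long)alloc_prep->buffers[i].size);
  for (i = 0; i < alloc_prep->block_size; i++)
    if (alloc_prep->blocks[i].buffer_ref >= 0) // Alias blocks carry buffer_ref == -1 and no offset of their own.
      printf("block %d -> buffer %d at offset %llu\n", alloc_prep->blocks[i].block_ref, alloc_prep->blocks[i].buffer_ref, (unsigned long long)alloc_prep->blocks[i].offset);
}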
667
668
static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
669
6.22k
{
670
6.22k
  int i;
671
97.6k
  for (i = 0; i < alloc_prep->vt_block_size; i++)
672
91.4k
    if (alloc_prep->alloc_dep[i])
673
10.9k
      ccv_array_free(alloc_prep->alloc_dep[i]);
674
22.4k
  for (i = 0; i < alloc_prep->buffer_size; i++)
675
16.2k
    if (alloc_prep->buffers[i].dup_p_refs)
676
13
      ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
677
6.22k
  ccfree(alloc_prep->alloc_dep);
678
6.22k
  ccfree(alloc_prep);
679
6.22k
}
680
681
// Simple allocator from ccv_array_t.
682
static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
683
76.7k
{
684
76.7k
  int pos = tensor_metadata->rnum;
685
76.7k
  int rsize = (size + 15) / 16;
686
76.7k
  ccv_array_resize(tensor_metadata, pos + rsize);
687
76.7k
  return (pos << 1) + 1;
688
76.7k
}
689
690
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
691
163k
{
692
163k
  assert((pos >> 1) < tensor_metadata->rnum);
693
163k
  return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
694
163k
}
695
696
83.6k
#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
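A small sketch (not from the original file; the function name and the 16-byte array element size are assumptions matching the (size + 15) / 16 rounding above) of the odd-tagged position scheme used by this metadata pool: positions are (slot << 1) | 1, so they can be told apart from real pointers and stay valid even if ccv_array_resize reallocates the backing storage.

static void tensor_metadata_example(void)
{
  ccv_array_t* const tensor_metadata = ccv_array_new(16, 1, 0); // Assumed 16-byte slots.
  const int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
  assert(CCV_NNC_IS_METADATA_POS(pos)); // The low bit tags this as a position, not a pointer.
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
  memset(tensor, 0, sizeof(ccv_nnc_tensor_t)); // The slot is raw storage; initialize before use.
  ccv_array_free(tensor_metadata);
}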
697
698
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
699
83.1k
{
700
  // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
701
83.1k
  if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
702
0
    return vt_tensor;
703
83.1k
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
704
83.1k
  if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref))
705
80
  {
706
80
    const int alias_ref = tensor->alias_ref;
707
80
    tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
708
80
    _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
709
80
  }
710
83.1k
  if (CCV_IS_TENSOR_MULTIVIEW(tensor))
711
84
  {
712
84
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
713
84
    int i;
714
84
    const int count = mv->kind + mv->repeat;
715
267
    for (i = 0; i < count; i++)
716
183
    {
717
183
      if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
718
147
      {
719
147
        const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
720
147
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
721
147
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
722
147
      }
723
183
    }
724
    // No need to recursively do parent pointer, otherwise we are in deep rewire.
725
84
    if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p))
726
0
      mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
727
84
    if (mv->sp)
728
65
      for (i = 0; i < mv->sp->rnum; i++)
729
37
      {
730
37
        ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
731
37
        if (CCV_NNC_IS_METADATA_POS(*tensor))
732
30
        {
733
30
          const int pos = (int)(intptr_t)*tensor;
734
30
          *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
735
30
          assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
736
30
          _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
737
30
        }
738
37
      }
739
84
  }
740
83.1k
  return tensor;
741
83.1k
}
742
743
typedef struct {
744
  const uint8_t* ptr;
745
  int pos;
746
} ccv_nnc_tensor_block_pos_t;
747
748
static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
749
114
{
750
114
  int i;
751
114
  int unref_block_ref = block_ref;
752
120
  while (prep->tensor_blocks[unref_block_ref].ref)
753
6
    unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
754
114
  int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
755
114
  assert(vt_ref >= 0);
756
114
  assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
757
114
  const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
758
114
  uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
759
114
  int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
760
114
  for (i = idx - 1; i >= 0; i--)
761
114
  {
762
114
    assert(p_ref >= 0);
763
114
    const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
764
114
    const int unroll_count = graph_prep->unroll_count;
765
114
    if (ch[i]) // Prefer the dup side of things.
766
12
      p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
767
114
    int unref_p_ref = p_ref;
768
114
    while (graph_prep->tensor_blocks[unref_p_ref].ref)
769
0
      unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
770
114
    vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
771
114
    const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
772
114
    offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
773
    // If the buffer already exists, prefer that.
774
114
    const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
775
114
    if (ptr)
776
114
    {
777
      // If I have any remaining path that is not covered from 0, I cannot possibly
778
      // have any pointer from buffer (that can only happen if it is not dup).
779
138
      for (--i; i >= 0; i--)
780
24
        if (ch[i] != 0)
781
0
          return 0;
782
      // Try to find the created tensor block pos in the array, just linear scan.
783
114
      const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
784
114
      ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
785
114
      *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
786
114
      ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
787
114
      return tv_pos;
788
114
    }
789
0
    p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
790
0
  }
791
0
  return 0;
792
114
}
793
794
// Descend from the root to the prep level, and compose the multiview from there.
795
static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
796
114
{
797
114
  assert(pos_ref);
798
114
  int i;
799
114
  const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
800
114
  const int unroll_count = prep->unroll_count;
801
114
  if (prep == graph_prep)
802
57
  {
803
57
    const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
804
57
    if (!data_pos)
805
0
      return -1;
806
    // Based on ch, go all the way back to find the exact pointer to compose.
807
57
    if (// !assign_update && // If I plan to receive an assign update, we don't need multiple receivers. Just one tensor to receive the update is enough.
808
57
      prep->dup_tensor_block_ref &&
809
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
810
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
811
41
    {
812
41
      int pos[unroll_count + 1];
813
41
      pos[0] = data_pos;
814
98
      for (i = 0; i < unroll_count; i++)
815
57
        pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
816
41
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
817
41
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
818
41
      ccv_nnc_tensor_t* data[unroll_count + 1];
819
139
      for (i = 0; i < unroll_count + 1; i++)
820
98
        data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
821
41
      ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
822
139
      for (i = 0; i < unroll_count + 1; i++)
823
98
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
824
41
      *pos_ref = mv_pos;
825
41
    } else {
826
16
      *pos_ref = data_pos;
827
16
    }
828
57
    if (preserve)
829
5
    {
830
      // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv.
831
      // At any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
832
      // mv of K11, thus, when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
833
      // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
834
      // arena is allocated).
835
      // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed the raw pointer directly or
836
      // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
837
      // it into a K01 structure.
838
      // Why didn't we wrap it directly as mv->data[0] pointing to the assigned tv pointer and mv->data[1] pointing
839
      // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one
840
      // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
841
5
      int prev_mv_pos = *pos_ref;
842
5
      if (prev_mv_pos == -1)
843
0
      {
844
0
        prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
845
0
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
846
0
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
847
0
        ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
848
0
          tv,
849
0
        }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
850
0
        CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
851
0
      }
852
5
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
853
5
      ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
854
5
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
855
5
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
856
5
        CCV_NNC_TENSOR_PLACEHOLDER,
857
5
        (ccv_nnc_tensor_t*)prev_mv,
858
5
      }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
859
5
      prev_mv->p = (void*)(intptr_t)mv_pos;
860
5
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
861
5
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
862
5
      *pos_ref = mv_pos;
863
5
    }
864
57
    return 0;
865
57
  }
866
57
  ch[idx] = 0;
867
57
  int pos[unroll_count + 1];
868
57
  pos[0] = 0;
869
57
  const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
870
57
  assert(retval == 0);
871
67
  for (i = 0; i < unroll_count; i++)
872
10
  {
873
10
    ch[idx] = i + 1;
874
10
    pos[i + 1] = 0;
875
10
    const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
876
10
    if (dup_retval < 0)
877
0
    {
878
0
      assert(i == 0);
879
0
      break;
880
0
    }
881
10
  }
882
  // If current prep has no dup.
883
57
  if (i == 0)
884
47
  {
885
47
    *pos_ref = pos[0];
886
47
    return 0;
887
47
  }
888
10
  ccv_nnc_tensor_t* data[unroll_count + 1];
889
  // Compose to a new multiview.
890
30
  for (i = 0; i < unroll_count + 1; i++)
891
20
    { assert(pos[i] > 0); }
892
10
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
893
30
  for (i = 0; i < unroll_count + 1; i++)
894
20
    data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
895
10
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
896
10
  ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
897
30
  for (i = 0; i < unroll_count + 1; i++)
898
20
    if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
899
4
      ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
900
30
  for (i = 0; i < unroll_count + 1; i++)
901
20
    CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
902
10
  *pos_ref = mv_pos;
903
10
  return 0;
904
10
}
905
906
static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
907
312
{
908
312
  int i;
909
312
  int is_input = 0;
910
312
  assert(node);
911
766
  for (i = 0; i < node->input_size && !is_input; i++)
912
454
    if (p_ref == node->inputs[i])
913
153
      is_input = 1;
914
312
  int is_output = 0;
915
725
  for (i = 0; i < node->output_size && !is_output; i++)
916
413
    if (p_ref == node->outputs[i])
917
167
      is_output = 1;
918
  // Prefer it is an output if it is both the input and the output.
919
312
  if (is_output)
920
167
    return 1;
921
145
  if (is_input)
922
145
    return -1;
923
0
  return 0;
924
145
}
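For reference, a tiny wrapper (hypothetical, not in the original) spelling out the return convention of the helper above, which the preserve and force-broadcast checks below rely on: 1 means p_ref is an output of the node (outputs win when it is both), -1 means input only, and 0 means neither.

static int _p_ref_is_input_only(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t* const node)
{
  return _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, node) == -1;
}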
925
926
static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
927
61
{
928
  // No need to check whether to preserve if this is not a while loop.
929
61
  if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
930
8
    return 0;
931
61
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
932
  // If it is unassigned, no need to preserve.
933
53
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
934
2
    return 0;
935
51
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
936
  // If p is not input, no need to preserve at all.
937
51
  if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
938
19
    return 0;
939
32
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
940
32
  assert(vt_ref >= 0);
941
32
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
942
32
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
943
  // If the buffer is a truly read-only one, no need to preserve.
944
32
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
945
6
    return 0;
946
  /* This needs a detailed explanation: what does preserve mean?
947
   * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
948
   * also used outside of the while loop, we cannot reuse the memory region of x for
949
   * the loop, otherwise we will destroy x when doing the y = x + 1 computation (assuming
950
   * y uses the same memory region as x). The way to work around this is by using a different
951
   * memory region for y = x + 1, but for the first iteration, having x point to the
952
   * original. During the allocation process, the way to identify whether x should preserve
953
   * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
954
   * parent tensor is the same as the memory region it plans to use in the buffer, then we are
955
   * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
956
   * it is the input tensor whenever that is possible. A tensor block can point to two parent
957
   * tensors, one the input tensor and one the output tensor. p_refs[0] should be the input
958
   * tensor whenever that is possible. */
959
26
  if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
960
15
    return 0;
961
  // Otherwise, return 1 because we now need to preserve.
962
11
  return 1;
963
26
}
964
965
static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
966
58
{
967
58
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
968
  // If it is unassigned, no need to preserve.
969
58
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
970
0
    return 0;
971
  // Only tape vars need to force broadcast; otherwise we already share the same memory region.
972
58
  if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
973
54
    return 0;
974
4
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
975
  // If p is not output, no need to broadcast at all.
976
4
  if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
977
3
    return 0;
978
1
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
979
1
  assert(vt_ref >= 0);
980
1
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
981
1
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
982
  // If the buffer is a truly read-only one, no need to broadcast.
983
1
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
984
0
    return 0;
985
  // Otherwise, return 1 because we now need to force broadcast for this tape var.
986
1
  return 1;
987
1
}
988
989
static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
990
25
{
991
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
992
25
  int i;
993
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
994
53
    if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
995
8
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
996
45
    else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
997
7
      _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
998
25
}
999
1000
static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1001
25
{
1002
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1003
25
  int i;
1004
25
  if (mv->sp)
1005
8
    for (i = 0; i < mv->sp->rnum; i++)
1006
6
    {
1007
6
      ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
1008
6
      if (CCV_NNC_IS_METADATA_POS(*tensor))
1009
1
      {
1010
1
        const int pos = (int)(intptr_t)*tensor;
1011
1
        *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1012
1
        assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
1013
1
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1014
1
      }
1015
6
    }
1016
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
1017
53
  {
1018
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1019
8
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1020
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
1021
0
      CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
1022
53
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1023
7
      _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1024
53
  }
1025
25
}
1026
1027
static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1028
47
{
1029
  // Go to the root of the graph.
1030
47
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1031
47
  int i;
1032
104
  for (i = 1; prep->p; 
i++57
)
1033
57
    prep = prep->p;
1034
  // Root graph should have no dup tensor blocks.
1035
47
  assert(!prep->dup_tensor_block_ref);
1036
47
  const int c = i;
1037
47
  const ccv_nnc_symbolic_graph_prep_t* preps[c];
1038
47
  prep = graph_prep;
1039
47
  preps[c - 1] = prep;
1040
104
  for (i = 0; prep->p; 
i++57
)
1041
57
    preps[c - 2 - i] = prep = prep->p;
1042
47
  int ch[c]; // Variable-length array (dynamically sized) to record our selections as we recurse from top to bottom.
1043
47
  memset(ch, 0, sizeof(int) * c);
1044
47
  int pos = 0;
1045
47
  _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1046
47
  assert(ch[c - 1] == 0); // This should never be modified.
1047
47
  assert(pos > 0);
1048
47
  return pos;
1049
47
}
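// _ccv_nnc_tensor_multiview_gen above first measures the depth of the graph_prep
// parent chain, then records the chain root-first into a variable-length array. A
// standalone sketch of that walk-up-then-fill-backwards pattern with a plain linked
// node type (illustrative, not the library's prep structure).
#include <stdio.h>

struct node {
	const char* name;
	struct node* p; // Parent; 0 at the root.
};

int main(void)
{
	struct node root = { "root", 0 };
	struct node mid = { "mid", &root };
	struct node leaf = { "leaf", &mid };
	struct node* n = &leaf;
	int i, c = 1;
	for (; n->p; c++) // Count the depth of the chain, including the leaf itself.
		n = n->p;
	struct node* chain[c]; // VLA, like preps[c] above.
	n = &leaf;
	chain[c - 1] = n;
	for (i = 0; n->p; i++)
		chain[c - 2 - i] = n = n->p; // Fill from the back so chain[0] is the root.
	for (i = 0; i < c; i++)
		printf("%s ", chain[i]->name); // root mid leaf
	printf("\n");
	return 0;
}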
1050
1051
static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1052
3
{
1053
3
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1054
3
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1055
3
  ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : 
tensor0
;
1056
3
  ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1057
3
    CCV_NNC_TENSOR_PLACEHOLDER,
1058
3
    tv,
1059
3
  }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1060
3
  CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
1061
3
  CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
1062
3
  return mv_pos;
1063
3
}
1064
1065
static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1066
30
{
1067
30
  ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1068
30
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr);
1069
30
  if (!is_multiview)
1070
18
    return pos;
1071
24
  
while (12
CCV_IS_TENSOR_MULTIVIEW(tensor_ptr))
1072
12
  {
1073
12
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1074
12
    tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1075
12
  }
1076
12
  const ccv_nnc_tensor_t tensor = *tensor_ptr;
1077
12
  const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1078
12
  ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1079
12
  *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1080
12
  new_tensor->dataof = tensor.dataof;
1081
12
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1082
12
  new_tensor->alias_ref = (uintptr_t)pos;
1083
12
  ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1084
12
  return new_pos;
1085
30
}
1086
1087
static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1088
2.69k
{
1089
2.69k
  const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1090
  // What it references is not an alias.
1091
2.69k
  assert(vt_tensors[alias_ref]);
1092
2.69k
  const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1093
2.69k
  const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1094
2.69k
  assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1095
  // Will use that to determine whether to insert a reference or not.
1096
2.69k
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1097
2.70k
  while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1098
13
  {
1099
13
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1100
13
    alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1101
13
  }
1102
2.69k
  const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1103
  // If there is no ofs and the stride is packed against dim, we take a shortcut and just init as a normal tensor.
1104
2.69k
  int pos;
1105
2.69k
  if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1106
2.69k
    
ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim)2.66k
)
1107
2.63k
  {
1108
2.63k
    pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1109
2.63k
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1110
2.63k
    *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1111
2.63k
    tensor->dataof = alias_tensor.dataof;
1112
2.63k
  } else {
1113
59
    pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1114
59
    ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1115
    // Otherwise initialize a tensor view
1116
59
    *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1117
59
    tensor_view->alias_ref = (uintptr_t)alias_pos;
1118
59
  }
1119
2.69k
  vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1120
2.69k
  if (is_multiview)
1121
13
  {
1122
13
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1123
13
    ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1124
13
  }
1125
2.69k
}
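// The alias assignment above takes a shortcut: when the alias has no ofs and its
// stride is the packed (contiguous row-major) stride of its dimensions, a plain
// tensor over the same memory suffices; otherwise a strided tensor view is needed.
// A standalone sketch of that decision; packed_stride() and needs_view() are
// illustrative helpers, not the library's ccv_nnc_is_tensor_stride_packed.
#include <stdio.h>
#include <string.h>

#define MAX_DIM 8

// Compute the packed row-major stride for the given dimensions (0-terminated).
static void packed_stride(const int dim[MAX_DIM], int stride[MAX_DIM])
{
	int nd = 0;
	while (nd < MAX_DIM && dim[nd])
		nd++;
	memset(stride, 0, sizeof(int) * MAX_DIM);
	int i, s = 1;
	for (i = nd - 1; i >= 0; i--)
		stride[i] = s, s *= dim[i];
}

// A plain tensor suffices only when there is no offset and the stride is packed.
static int needs_view(const int ofs[MAX_DIM], const int dim[MAX_DIM], const int stride[MAX_DIM])
{
	static const int no_ofs[MAX_DIM] = { 0 };
	int packed[MAX_DIM];
	packed_stride(dim, packed);
	return memcmp(ofs, no_ofs, sizeof(no_ofs)) != 0 || memcmp(stride, packed, sizeof(packed)) != 0;
}

int main(void)
{
	const int dim[MAX_DIM] = { 2, 3, 4 };
	const int packed[MAX_DIM] = { 12, 4, 1 };
	const int strided[MAX_DIM] = { 24, 8, 2 }; // E.g. a slice out of a larger buffer.
	const int zero[MAX_DIM] = { 0 };
	const int off[MAX_DIM] = { 1, 0, 0 };
	// 0 (plain tensor is fine), 1 (offset forces a view), 1 (stride forces a view).
	printf("%d %d %d\n", needs_view(zero, dim, packed), needs_view(off, dim, packed), needs_view(zero, dim, strided));
	return 0;
}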
1126
1127
static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1128
2.69k
{
1129
  // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1130
2.69k
  if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]) && 
tensor_blocks[block_ref].alias_ref3
&&
!vt_tensors[block_ref]3
)
1131
3
  {
1132
3
    const int ref = tensor_blocks[block_ref].alias_ref - 1;
1133
3
    if (!vt_tensors[ref])
1134
0
      _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1135
3
    vt_tensors[block_ref] = vt_tensors[ref];
1136
3
    return;
1137
3
  }
1138
2.69k
  assert
(tensor_symbol_info[block_ref].alias_ref)2.69k
;
1139
2.69k
  const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1140
  // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases are assigned.
1141
2.69k
  if (!vt_tensors[alias_ref])
1142
3
    _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1143
2.69k
  _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1144
2.69k
}
1145
1146
// Turn a linear pointer into object storage (such as an MTLBuffer).
1147
#ifdef HAVE_MPS
1148
static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
1149
{
1150
  mpobjfree(0, ptr);
1151
}
1152
#endif
1153
1154
typedef struct {
1155
  size_t size;
1156
  void* obj;
1157
} tensor_arena_obj_track_t;
1158
1159
typedef struct {
1160
  void* ptr;
1161
  off_t offset;
1162
  size_t size;
1163
} obj_ptr_key_t;
1164
1165
static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1166
0
{
1167
0
  return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1168
0
}
1169
1170
static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1171
0
{
1172
0
  return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1173
0
}
1174
1175
KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)
1176
1177
static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1178
27.2k
{
1179
27.2k
  if (params.dim[0] == 0)
1180
0
    return 0;
1181
#ifdef HAVE_MPS
1182
  if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
1183
  {
1184
    int ret;
1185
    const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype) * ccv_nnc_tensor_count(params);
1186
    const obj_ptr_key_t key = {
1187
      .ptr = ptr,
1188
      .offset = offset,
1189
      .size = size,
1190
    };
1191
    khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret);
1192
    if (ret != 0)
1193
    {
1194
      void* obj = mpobjcreate(ptr, offset, size);
1195
      if (!tensor_arena->disposers)
1196
        tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1197
      ccv_nnc_arena_disposer_t disposer = {
1198
        .ptr = obj,
1199
        .userdata = 0,
1200
        .dispose = _ccv_nnc_tensor_arena_obj_dispose
1201
      };
1202
      ccv_array_push(tensor_arena->disposers, &disposer);
1203
      kh_val(obj_ptr_map, k) = obj;
1204
      return obj;
1205
    } else
1206
      return kh_val(obj_ptr_map, k);
1207
  }
1208
#endif
1209
27.2k
  return ptr + offset;
1210
27.2k
}
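// The MPS branch above deduplicates object creation by a (ptr, offset, size) key:
// kh_put reports via its ret argument whether the key was newly inserted, so the
// heavier object is created once and reused on later hits. A standalone sketch of
// that kh_put/ret pattern, assuming klib's khash.h is available on the include
// path; the key type and make_object() stand-in are illustrative.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "khash.h"

typedef struct {
	void* ptr;
	long offset;
	size_t size;
} dedup_key_t;

static inline khint32_t dedup_hash(const dedup_key_t k)
{
	return (khint32_t)(((uintptr_t)k.ptr >> 4) + k.offset + k.size);
}

static inline int dedup_equal(const dedup_key_t a, const dedup_key_t b)
{
	return a.ptr == b.ptr && a.offset == b.offset && a.size == b.size;
}

KHASH_INIT(dedup, dedup_key_t, void*, 1, dedup_hash, dedup_equal)

static void* make_object(void* const ptr, const long offset) // Stand-in for the heavier object creation.
{
	return (char*)ptr + offset;
}

int main(void)
{
	char buffer[64];
	khash_t(dedup)* const map = kh_init(dedup);
	const dedup_key_t key = { buffer, 16, 32 };
	int ret;
	khiter_t k = kh_put(dedup, map, key, &ret);
	if (ret != 0) // Newly inserted: create the object and remember it.
		kh_val(map, k) = make_object(buffer, 16);
	void* const first = kh_val(map, k);
	k = kh_put(dedup, map, key, &ret); // Same key again: ret == 0, reuse the cached object.
	void* const second = (ret != 0) ? (kh_val(map, k) = make_object(buffer, 16)) : kh_val(map, k);
	printf("%d\n", first == second); // 1
	kh_destroy(dedup, map);
	return 0;
}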
1211
1212
static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1213
6.22k
{
1214
  // All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1215
  // Each tensor have the designation in assigned array, and offset in allocated_offset.
1216
6.22k
  const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1217
6.22k
  ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1218
6.22k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1219
6.22k
  const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1220
6.22k
  const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1221
6.22k
  const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? 
p_graph_prep->alloc_prep49
:
06.17k
;
1222
6.22k
  const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1223
6.22k
  const int unroll_count = graph_prep->unroll_count;
1224
6.22k
  int i, j;
1225
97.5k
  for (i = 0; i < tensor_symbol_info_size; 
i++91.2k
)
1226
91.2k
    
for (j = 0; 91.2k
TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) &&
j < unroll_count61.3k
;
j++7
)
1227
7
    {
1228
7
      const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1229
7
      if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref]))
1230
3
        TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i]);
1231
7
    }
1232
6.22k
  ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1233
6.22k
  graph_prep->tensor_arena = tensor_arena;
1234
6.22k
  tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1235
6.22k
  tensor_arena->buffers = (void*)(tensor_arena + 1);
1236
6.22k
  tensor_arena->buffer_size = alloc_prep->buffer_size;
1237
6.22k
  tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1238
6.22k
  tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1239
6.22k
  tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1240
6.22k
  tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1241
6.22k
  tensor_arena->pb_vt_tensors = 0;
1242
6.22k
  tensor_arena->vt_alias_r_refs_p = 0;
1243
6.22k
  tensor_arena->vt_alias_r_refs = 0;
1244
6.22k
  tensor_arena->vt_sizes = 0;
1245
6.22k
  tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1246
6.22k
  tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1247
6.22k
  tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1248
6.22k
  tensor_arena->allocator.context.free = allocator.context.free;
1249
6.22k
  tensor_arena->allocator.isa = allocator.isa;
1250
6.22k
  tensor_arena->disposers = 0;
1251
  // Copy alias_ref info back to the tensor arena.
1252
97.5k
  for (i = 0; i < tensor_symbol_info_size; 
i++91.2k
)
1253
91.2k
    tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1254
  // Do the buffer copies.
1255
22.4k
  for (i = 0; i < alloc_prep->buffer_size; 
i++16.2k
)
1256
16.2k
    tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1257
16.2k
      tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1258
16.2k
      tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1259
6.22k
  if (graph_prep->while_count_tensor)
1260
19
  {
1261
    // If we need to have a while count tensor, allocate that first and set its pointer to point to the while_count variable.
1262
19
    int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1263
19
    assert((0 << 1) + 1 == pos); // pos must be 0 position.
1264
19
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1265
19
    *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1266
19
  }
1267
6.22k
  assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
1268
6.22k
  if (p_arena && 
p_graph_prep49
)
1269
49
  {
1270
    // Don't need to allocate the actual buffer, just use the pointer from the above.
1271
49
    PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
1272
229
    for (i = 0; i < tensor_arena->buffer_size; 
i++180
)
1273
180
    {
1274
180
      const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1275
180
      int unref_p_ref = p_ref;
1276
182
      while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1277
2
        unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1278
180
      assert(unref_p_ref >= 0);
1279
180
      const int p_unroll_count = p_graph_prep->unroll_count;
1280
180
      if (p_graph_prep->dup_tensor_block_ref &&
1281
180
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 016
&&
1282
180
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref16
)
1283
10
      {
1284
        // This condition means in the parent graph, we point to multiple tensor blocks for the same
1285
        // buffer, therefore, we cannot have one single pointer assigned in this case.
1286
        // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
1287
10
        tensor_arena->buffers[i].ptr = 0;
1288
10
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1289
10
        continue;
1290
10
      }
1291
      // Otherwise, find the actual buffer pointer.
1292
170
      const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1293
170
      assert(vt_ref >= 0);
1294
170
      const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1295
170
      if (!p_arena->buffers[buffer_ref].ptr)
1296
0
      {
1297
        // Pass it down as 0 ptr.
1298
0
        tensor_arena->buffers[i].ptr = 0;
1299
0
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1300
0
        continue;
1301
0
      }
1302
170
      const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1303
170
      tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1304
170
      PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
1305
170
    }
1306
6.17k
  } else {
1307
    // Now, allocate actual buffers.
1308
6.17k
    PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
1309
22.2k
    for (i = 0; i < tensor_arena->buffer_size; 
i++16.0k
)
1310
16.0k
    {
1311
16.0k
      const int buffer_type = tensor_arena->buffers[i].type;
1312
16.0k
      const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
1313
16.0k
#ifdef HAVE_CUDA
1314
16.0k
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1315
2.35k
      {
1316
2.35k
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1317
2.35k
        if (allocator.isa && 
allocator.isa->alloc266
)
1318
266
          tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1319
2.09k
        else
1320
2.09k
          tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1321
2.35k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1322
13.7k
      } else {
1323
13.7k
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1324
13.7k
        if (tensor_arena->buffers[i].pin_mem)
1325
17
          tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1326
13.7k
        else
1327
13.7k
          ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1328
13.7k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1329
13.7k
      }
1330
#elif defined(HAVE_MPS)
1331
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1332
      {
1333
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1334
        // if (allocator.isa && allocator.isa->alloc)
1335
        //  tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1336
        // else
1337
        tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1338
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1339
      } else {
1340
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1341
        ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1342
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1343
      }
1344
#else
1345
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1346
      ccmemalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1347
      PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1348
#endif
1349
16.0k
      assert(tensor_arena->buffers[i].ptr);
1350
16.0k
    }
1351
6.17k
  }
1352
  // Go over sub_preps and allocate arenas for them. Do this early because
1353
  // we may reference tensors from sub arenas; the reason we need to reference
1354
  // tensors from sub arenas is that, for output tensors, the sub arena's tensor
1355
  // will have automatic reference updates.
1356
6.27k
  
for (i = 0; 6.22k
i < tensor_arena->sub_arena_size;
i++50
)
1357
50
    if (graph_prep->sub_preps[i])
1358
49
      tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1359
1
    else
1360
1
      tensor_arena->sub_arenas[i] = 0;
1361
6.22k
  memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1362
  // Now that sub-arenas are all assigned, go over their outputs to assign out tensors from them directly.
1363
6.22k
  ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? 
(ccv_nnc_tensor_t**)29
cccalloc29
(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) :
06.19k
;
1364
#ifdef HAVE_MPS
1365
  khash_t(obj_ptr)* obj_ptr_map = kh_init(obj_ptr);
1366
#else
1367
6.22k
  khash_t(obj_ptr)* obj_ptr_map = 0;
1368
6.22k
#endif
1369
6.27k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1370
50
    if (tensor_arena->sub_arenas[i])
1371
49
    {
1372
49
      assert(graph_prep->sub_preps[i]);
1373
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1374
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1375
49
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1376
45
        
for (j = 0; 21
j < node->output_size;
j++24
)
1377
24
        {
1378
24
          const int idx = node->outputs[j];
1379
24
          const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1380
24
          assert(s_idx >= 0);
1381
24
          ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1382
24
          assert(sub_arena_out_tensors[idx] == 0);
1383
24
          ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1384
          // Only assign if it is a multiview tensor.
1385
24
          if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) ||
1386
24
            
(8
sub_alias8
&&
CCV_IS_TENSOR_MULTIVIEW1
(sub_alias)))
1387
17
            sub_arena_out_tensors[idx] = sub_tensor;
1388
24
        }
1389
49
    }
1390
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
1391
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1392
91.2k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1393
27.2k
    {
1394
27.2k
      const int vt_ref = alloc_prep->vt_blocks[i];
1395
27.2k
      const int buffer_ref = vt_ref >= 0 ? 
alloc_prep->blocks[vt_ref].buffer_ref27.2k
:
-13
;
1396
      // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1397
      // previous layer; therefore, we cannot really find the buffer ptr.
1398
27.2k
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[i]101
) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1399
27.2k
        
(27.2k
(27.2k
graph_prep->dup_tensor_block_ref27.2k
&&
1400
27.2k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] >= 059
&&
1401
27.2k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] != i57
) ||
1402
27.2k
         
(27.2k
buffer_ref >= 027.2k
&&
!tensor_arena->buffers[buffer_ref].ptr27.2k
)))
1403
47
      {
1404
47
        assert(graph_prep->p); // This must be in a sub-graph.
1405
        // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1406
47
        if (graph_prep->tensor_blocks[i].p_refs[0] && 
_ccv_nnc_tensor_block_check_preserve(graph_prep, i)36
)
1407
4
          continue;
1408
43
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1409
43
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1410
43
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1411
27.2k
      } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])) {
1412
        // When we want to allocate, we don't really need to if it needs a force broadcast, because we will handle that later.
1413
27.2k
        const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1414
        // If already created, use the same tensor, and continue.
1415
        // Having ptr.
1416
27.2k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1417
27.2k
        ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1418
        // Also, set its allocations.
1419
        // Since tensor view is bit compatible with tensor, we can just cast.
1420
27.2k
        void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1421
27.2k
        *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1422
27.2k
        assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size);
1423
        // If we need to force broadcast, we need to wrap it in a multiview.
1424
27.2k
        if (graph_prep->tensor_blocks[i].p_refs[0] &&
1425
27.2k
          
_ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i)58
)
1426
1
        {
1427
1
          const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1428
1
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1429
1
          ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1430
1
          ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1431
1
            tv,
1432
1
          }, 0, 1, graph_prep->graph, mv);
1433
1
          CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1434
1
          pos = mv_pos;
1435
1
          ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1436
1
        }
1437
27.2k
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1438
27.2k
      }
1439
27.2k
    }
1440
#ifdef HAVE_MPS
1441
  kh_destroy(obj_ptr, obj_ptr_map);
1442
#endif
1443
  // Handle bound tensors. First handle the cases without aliases.
1444
53.8k
  
for (i = 0; 6.22k
i < tensor_bind_size;
i++47.5k
)
1445
47.5k
  {
1446
47.5k
    assert(tensor_binds[i].tensor);
1447
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1448
47.5k
    if (resolved_symbol.d >= 0)
1449
47.5k
    {
1450
47.5k
      int d = resolved_symbol.d;
1451
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1452
1.02k
        continue;
1453
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1454
      // It has nothing to do with alias.
1455
46.7k
      
while (46.5k
TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1456
146
        d = tensor_blocks[d].ref - 1;
1457
      // For bound tensors, it shouldn't be assigned yet.
1458
      // If it is assigned, the pointer should match the one from the bound tensor.
1459
      // This can only happen if an enforced in-place tensor is bound twice. If that
1460
      // happens, we need to make sure it is bound to the same location.
1461
46.5k
      assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1462
      // See above assertion.
1463
46.5k
      if (tensor_arena->vt_tensors[d])
1464
0
        continue;
1465
46.5k
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1466
0
      {
1467
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1468
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1469
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1470
0
        if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1471
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1472
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1473
        // It is OK for it to be, as a whole, smaller than or equal to the bound one.
1474
0
        assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1475
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1476
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1477
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1478
46.5k
      } else {
1479
46.5k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1480
46.5k
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1481
46.5k
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1482
46.5k
        tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1483
46.5k
        tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1484
46.5k
        tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1485
46.5k
        tv->dataof = tensor_binds[i].tensor->dataof;
1486
46.5k
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1487
46.5k
      }
1488
46.5k
    }
1489
47.5k
  }
1490
  // Handle bound tensors. We handle aliases here so they can reference bound tensors.
1491
53.8k
  
for (i = 0; 6.22k
i < tensor_bind_size;
i++47.5k
)
1492
47.5k
  {
1493
47.5k
    assert(tensor_binds[i].tensor);
1494
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1495
47.5k
    if (resolved_symbol.d >= 0)
1496
47.5k
    {
1497
47.5k
      int d = resolved_symbol.d;
1498
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1499
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1500
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1501
      // It has nothing to do with alias.
1502
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1503
146
        d = tensor_blocks[d].ref - 1;
1504
47.5k
      if (tensor_arena->vt_tensors[d])
1505
47.5k
        continue;
1506
      // Assert original alias has no ofs. Otherwise our binding will be problematic.
1507
26
      
for (j = 0; 2
j < CCV_NNC_MAX_DIM_ALLOC;
j++24
)
1508
24
        { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0); }
1509
2
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1510
0
      {
1511
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1512
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1513
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1514
0
        if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1515
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1516
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1517
        // It is OK for it to be, as a whole, smaller than or equal to the bound one.
1518
0
        assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info));
1519
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1520
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1521
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1522
2
      } else {
1523
2
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1524
2
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1525
2
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1526
2
        tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1527
2
        tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1528
2
        tv->data = tensor_binds[i].tensor->data;
1529
2
        tv->dataof = tensor_binds[i].tensor->dataof;
1530
2
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1531
2
      }
1532
2
    }
1533
47.5k
  }
1534
  // Assign out refs. Refs are the simple ones, so we should handle them first (because they point to exactly the same metadata and the same region).
1535
  // Avoid refs that are actually aliases.
1536
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1537
    // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1538
91.2k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_blocks[i].ref61.3k
&&
!tensor_arena->vt_tensors[i]6.37k
&&
!tensor_blocks[i].alias_ref6.37k
)
1539
6.20k
    {
1540
6.20k
      int ref = tensor_blocks[i].ref - 1;
1541
6.20k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && 
tensor_blocks[ref].ref149
)
1542
1
        ref = tensor_blocks[ref].ref - 1;
1543
6.20k
      assert(tensor_arena->vt_tensors[ref]);
1544
6.20k
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1545
6.20k
    }
1546
  // Now that refs are assigned out, handle the case where I need to preserve because I am a sub graph of a while loop.
1547
6.22k
  if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1548
21
  {
1549
21
    assert(graph_prep->p);
1550
21
    const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1551
21
    const int p_idx = graph_prep->p_idx - 1;
1552
46
    for (i = 0; i < node->input_size; 
i++25
)
1553
25
    {
1554
25
      const int idx = node->inputs[i];
1555
25
      int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1556
25
      assert(!tensor_blocks[block_ref].ref);
1557
25
      const int vt_ref = alloc_prep->vt_blocks[block_ref];
1558
25
      if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1559
18
        continue;
1560
25
      assert
(vt_ref >= 0)7
;
1561
7
      const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1562
7
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1563
7
      assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]));
1564
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1565
      // previous layer, therefore, cannot really find the buffer ptr.
1566
7
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[block_ref]0
) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1567
7
        ((graph_prep->dup_tensor_block_ref &&
1568
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 04
&&
1569
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref4
) ||
1570
7
         
!tensor_arena->buffers[buffer_ref].ptr3
))
1571
4
      {
1572
        // We haven't allocated anything for this yet.
1573
4
        assert(tensor_arena->vt_tensors[block_ref] == 0);
1574
4
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1575
4
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1576
4
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1577
4
      } else {
1578
3
        const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1579
3
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1580
3
        ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1581
3
      }
1582
7
    }
1583
21
  }
1584
  // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1585
  // This creates the multi-view tensor to achieve that.
1586
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1587
91.2k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1588
10
    {
1589
10
      const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1590
      // Create phi multi-view.
1591
10
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1592
10
      const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1593
10
      const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1594
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1595
10
      ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1596
10
      ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1597
10
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1598
10
        intv,
1599
10
        outv,
1600
10
      }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI, mv);
1601
10
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1602
10
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1603
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1604
10
      ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1605
10
    }
1606
  // Now it is time to handle alias.
1607
36.3k
  for (i = 0; i < alloc_prep->block_size; 
i++30.1k
)
1608
30.1k
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1609
29.9k
    {
1610
29.9k
      const int block_ref = alloc_prep->blocks[i].block_ref;
1611
29.9k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]) && 
!tensor_arena->vt_tensors[block_ref]2.69k
)
1612
2.69k
      {
1613
        // Assigning out the tensor aliases.
1614
2.69k
        assert(tensor_symbol_info[block_ref].alias_ref);
1615
2.69k
        _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1616
2.69k
      }
1617
29.9k
    }
1618
  // Now assigning out the rest of alias refs.
1619
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1620
    // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1621
91.2k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_blocks[i].alias_ref61.3k
&&
!tensor_arena->vt_tensors[i]167
)
1622
164
    {
1623
164
      int ref = tensor_blocks[i].alias_ref - 1;
1624
164
      assert(tensor_arena->vt_tensors[ref]);
1625
164
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1626
164
    }
1627
  // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1628
6.27k
  
for (i = 0; 6.22k
i < tensor_arena->sub_arena_size;
i++50
)
1629
50
    if (tensor_arena->sub_arenas[i])
1630
49
    {
1631
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1632
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1633
138
      for (j = 0; j < node->input_size; 
j++89
)
1634
89
      {
1635
89
        const int idx = node->inputs[j];
1636
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && 
tensor_symbol_info[idx].s_ref->rnum > i87
) ?
*(int*)78
ccv_array_get78
(tensor_symbol_info[idx].s_ref, i) - 1 :
-111
;
1637
89
        if (s_idx < 0)
1638
23
          continue;
1639
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1640
        // Only do the replacement if it is a multi-view tensor.
1641
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1642
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor) &&
!18
TENSOR_EXPECT_UNASSIGNED18
(tensor_blocks[idx]))
1643
18
        {
1644
          // It cannot be a bound tensor.
1645
18
          assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]));
1646
18
          const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1647
18
          const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1648
18
          ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? 
sub_arena_out_tensors[idx]1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)17
;
1649
          // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1650
          // to this tensor.
1651
18
          if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor))
1652
6
          {
1653
6
            const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1654
6
            ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1655
6
            ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? 
vt_tensor1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)5
);
1656
6
            ref_tensor->alias_ref = is_sub_arena_out_tensor ? 
(uintptr_t)vt_tensor1
:
(uintptr_t)vt_pos5
;
1657
6
            ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1658
6
            ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? 
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)5
CCV_NNC_MULTIVIEW_DATA5
(multiview)[0]) :
CCV_NNC_MULTIVIEW_DATA1
(multiview)[0]1
);
1659
6
            while (CCV_IS_TENSOR_MULTIVIEW(tv))
1660
0
              tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]);
1661
6
            *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1662
6
            ref_tensor->data = tv->data;
1663
6
            ref_tensor->dataof = tv->dataof;
1664
6
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1665
6
          } else
1666
12
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? 
vt_tensor0
: (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1667
18
        }
1668
66
      }
1669
49
    }
1670
  // After aliases are created, for the case..of statement, we now revert back to a flat tensor rather than a multi-view.
1671
  // No worries though, this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1672
  // when initializing the case..of node, which will take the phi multi-view again.
1673
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1674
91.2k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1675
10
    {
1676
10
      assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]));
1677
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1678
10
      assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI);
1679
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1680
10
    }
1681
  // Rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1682
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1683
91.2k
    if (tensor_arena->vt_tensors[i])
1684
82.9k
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1685
  // Associate multiview tensors from sub arena to the parent.
1686
6.22k
  if (sub_arena_out_tensors)
1687
29
  {
1688
240
    for (i = 0; i < alloc_prep->block_size; 
i++211
)
1689
211
      if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1690
111
      {
1691
111
        const int block_ref = alloc_prep->blocks[i].block_ref;
1692
111
        if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1693
0
          continue;
1694
111
        int sub_arena_ref = block_ref;
1695
111
        if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1696
10
        {
1697
          // Assigning out the tensor aliases.
1698
10
          assert(tensor_symbol_info[block_ref].alias_ref);
1699
10
          const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1700
          // What it references is not an alias.
1701
10
          assert(tensor_arena->vt_tensors[alias_ref]);
1702
10
          sub_arena_ref = alias_ref;
1703
10
          if (!sub_arena_out_tensors[sub_arena_ref])
1704
3
            continue;
1705
10
        }
1706
108
        if (!sub_arena_out_tensors[sub_arena_ref])
1707
84
          continue;
1708
24
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref]) ? 
sub_arena_out_tensors[sub_arena_ref]23
:
(ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref1
);
1709
24
        assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1710
        // This is only possible if the vt_tensors is a phi node.
1711
24
        if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1712
0
        {
1713
          // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1714
0
          ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1715
0
          assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI);
1716
0
          assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]));
1717
0
          CCV_NNC_MULTIVIEW_DATA(phi)[1]->alias_ref = (uintptr_t)mv;
1718
0
          ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)[1]);
1719
24
        } else {
1720
24
          tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1721
24
          ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1722
24
        }
1723
24
      }
1724
29
  }
1725
  // Go over all the tensors that have assign_ref. If the tensor it is assigned from is:
1726
  // 1). From sub_arena_out_tensors, it is possible that it now points to an area this arena doesn't know about.
1727
  // 2). From a phi multi-view; in this case, this arena won't know beforehand which memory it is going to use.
1728
  // Therefore, for the above two scenarios, the tensor that has assign_ref, even if it is a multiview tensor, needs to subscribe
1729
  // to the output of the assign_ref tensor.
1730
97.5k
  
for (i = 0; 6.22k
i < tensor_symbol_info_size;
i++91.2k
)
1731
91.2k
    if (tensor_arena->vt_tensors[i] && 
tensor_symbol_info[i].assign_ref82.9k
)
1732
25
    {
1733
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1734
25
      ccv_nnc_tensor_t* assign_tensor;
1735
25
      if (sub_arena_out_tensors && 
sub_arena_out_tensors[assign_ref]3
)
1736
0
        assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref]) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1737
25
      else
1738
25
        assign_tensor = tensor_arena->vt_tensors[assign_ref];
1739
25
      ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1740
25
    }
1741
  // After everything is handled, assert again to make sure the tensors and tensor binds point to the right location. This is really just for assertion.
1742
53.8k
  for (i = 0; i < tensor_bind_size; 
i++47.5k
)
1743
47.5k
  {
1744
47.5k
    assert(tensor_binds[i].tensor);
1745
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1746
47.5k
    if (resolved_symbol.d >= 0)
1747
47.5k
    {
1748
47.5k
      int d = resolved_symbol.d;
1749
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1750
      // It has nothing to do with alias.
1751
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && 
tensor_blocks[d].ref46.7k
)
1752
146
        d = tensor_blocks[d].ref - 1;
1753
      // Note we don't trace back on alias. This is intentional.
1754
47.5k
      assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1755
47.5k
    }
1756
47.5k
  }
1757
6.22k
  if (sub_arena_out_tensors)
1758
29
    ccfree(sub_arena_out_tensors);
1759
  // Rewire sub arena's tensor references.
1760
6.27k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1761
50
    if (tensor_arena->sub_arenas[i])
1762
49
    {
1763
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1764
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1765
138
      for (j = 0; j < node->input_size; 
j++89
)
1766
89
      {
1767
89
        const int idx = node->inputs[j];
1768
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && 
tensor_symbol_info[idx].s_ref->rnum > i87
) ?
*(int*)78
ccv_array_get78
(tensor_symbol_info[idx].s_ref, i) - 1 :
-111
;
1769
89
        if (s_idx < 0)
1770
23
          continue;
1771
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1772
        // Only do the replacement if it is a multi-view tensor.
1773
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1774
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor))
1775
18
        {
1776
          // This is a bound tensor; bind it now.
1777
18
          if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1778
0
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1779
18
          else
1780
18
            _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1781
18
        }
1782
66
      }
1783
49
    }
1784
6.22k
  return tensor_arena;
1785
6.22k
}
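// _ccv_nnc_tensor_arena_new above performs one ccmalloc and carves its variable-size
// arrays (buffers, sub_arenas, vt_tensors, vt_alias_refs) out of the tail of that
// single allocation. A standalone sketch of the same single-allocation layout with
// illustrative types; the real arena carries more fields than shown here.
#include <stdio.h>
#include <stdlib.h>

typedef struct {
	void* ptr;
	size_t size;
} buffer_t;

typedef struct {
	int buffer_count;
	int tensor_count;
	buffer_t* buffers; // Points into the same allocation, right after the struct.
	void** tensors;    // Follows the buffers.
	int* alias_refs;   // Follows the tensors.
} arena_t;

static arena_t* arena_new(const int buffer_count, const int tensor_count)
{
	arena_t* const arena = (arena_t*)malloc(sizeof(arena_t) + sizeof(buffer_t) * buffer_count + sizeof(void*) * tensor_count + sizeof(int) * tensor_count);
	arena->buffer_count = buffer_count;
	arena->tensor_count = tensor_count;
	arena->buffers = (buffer_t*)(arena + 1);
	arena->tensors = (void**)(arena->buffers + buffer_count);
	arena->alias_refs = (int*)(arena->tensors + tensor_count);
	return arena;
}

int main(void)
{
	arena_t* const arena = arena_new(2, 3);
	arena->buffers[1].size = 42;
	arena->alias_refs[2] = 7;
	printf("%zu %d\n", arena->buffers[1].size, arena->alias_refs[2]); // 42 7
	free(arena); // One free releases every embedded array.
	return 0;
}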
1786
1787
static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1788
17
{
1789
17
  assert(graph);
1790
17
  if ((intptr_t)graph == tensor_arena->graph_ref)
1791
7
  {
1792
7
    assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size);
1793
7
    return tensor_arena->vt_tensors[pair_ref];
1794
7
  }
1795
10
  int i;
1796
13
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++3
)
1797
10
    if (tensor_arena->sub_arenas[i])
1798
10
    {
1799
10
      ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1800
10
      if (tensor)
1801
7
        return tensor;
1802
10
    }
1803
3
  return 0;
1804
10
}
1805
1806
static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1807
7
{
1808
7
  if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
1809
5
    tensor->type |= CCV_TAPE_ALLOC;
1810
2
  else {
1811
2
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1812
2
    mv->type |= CCV_TAPE_ALLOC;
1813
2
    int i;
1814
5
    for (i = 0; i < mv->repeat + mv->kind; 
i++3
)
1815
3
      _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1816
2
  }
1817
7
}
1818
1819
static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1820
6.22k
{
1821
6.22k
  assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
1822
6.22k
  int i;
1823
97.5k
  for (i = 0; i < graph_prep->tensor_symbol_info_size; 
i++91.2k
)
1824
91.2k
  {
1825
91.2k
    if (graph_prep->tensor_symbol_info[i].pair_ref)
1826
7
    {
1827
7
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1828
      // No need to continue check this if it is from its pair.
1829
7
      continue;
1830
7
    }
1831
91.2k
    if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && 
tensor_arena->vt_tensors[i]7
)
1832
7
    {
1833
      // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1834
7
      if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i]))
1835
5
      {
1836
5
        const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1837
5
        if (vt_ref >= 0 &&
1838
5
          TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref]) == READ_ONLY)
1839
3
          continue;
1840
5
      }
1841
4
      _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1842
4
    }
1843
91.2k
  }
1844
6.27k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
1845
50
    if (graph_prep->sub_preps[i])
1846
49
      _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1847
6.22k
}
1848
1849
static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1850
128k
{
1851
128k
  int i, found = 0;
1852
  // Try to insert head.
1853
128k
  ccv_array_t* head = tensor_blocks.head;
1854
128k
  assert(head);
1855
130k
  
for (i = 0; 128k
i < head->rnum;)
1856
60.7k
  {
1857
60.7k
    const int head_idx = *(int*)ccv_array_get(head, i);
1858
60.7k
    if (head_idx == idx)
1859
118
    {
1860
118
      found = 1;
1861
118
      break;
1862
118
    }
1863
60.6k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1864
60.6k
    if (cell.i32 && 
cell.i32[0] > 041
)
1865
41
    {
1866
      /* If the current node is the parent of the head node, check if we found it or not. */
1867
      /* If not found, replace the current one. */
1868
41
      if (!found)
1869
41
      {
1870
41
        found = 1;
1871
41
        *(int*)ccv_array_get(head, i) = idx;
1872
41
      } else {
1873
        /* Remove the current one, change the rnum. */
1874
0
        if (i < head->rnum - 1)
1875
0
          *(int*)ccv_array_get(head, i) = *(int*)ccv_array_get(head, head->rnum - 1);
1876
0
        --head->rnum;
1877
0
        continue;
1878
0
      }
1879
60.6k
    } else {
1880
      // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1881
60.6k
      cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1882
60.6k
      if (cell.i32 && 
cell.i32[0] > 058.4k
)
1883
58.4k
      {
1884
58.4k
        found = 1;
1885
58.4k
        break;
1886
58.4k
      }
1887
60.6k
    }
1888
    /* Advancing i. */
1889
2.17k
    ++i;
1890
2.17k
  }
1891
  /* If not found, push this idx to the end of the array. */
1892
128k
  if (!found)
1893
70.1k
    ccv_array_push(head, &idx);
1894
  // Try to insert tail.
1895
128k
  found = 0;
1896
128k
  ccv_array_t* tail = tensor_blocks.tail;
1897
128k
  assert(tail);
1898
186k
  
for (i = 0; 128k
i < tail->rnum;)
1899
61.9k
  {
1900
61.9k
    const int tail_idx = *(int*)ccv_array_get(tail, i);
1901
61.9k
    if (tail_idx == idx)
1902
4.48k
    {
1903
4.48k
      found = 1;
1904
4.48k
      break;
1905
4.48k
    }
1906
57.4k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1907
57.4k
    if (cell.i32 && cell.i32[0] > 0)
1908
55.1k
    {
1909
      /* If the current node is the child of the tail node, check if we found it or not. */
1910
      /* If not found, replace the current one. */
1911
55.1k
      if (!found)
1912
53.9k
      {
1913
53.9k
        found = 1;
1914
53.9k
        *(int*)ccv_array_get(tail, i) = idx;
1915
53.9k
      } else {
1916
        /* Remove the current one, change the rnum. */
1917
1.13k
        *(int*)ccv_array_get(tail, i) = *(int*)ccv_array_get(tail, tail->rnum - 1);
1918
1.13k
        --tail->rnum;
1919
1.13k
        continue;
1920
1.13k
      }
1921
55.1k
    } else {
1922
      // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1923
2.37k
      cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1924
2.37k
      if (cell.i32 && cell.i32[0] > 0)
1925
110
      {
1926
110
        found = 1;
1927
110
        break;
1928
110
      }
1929
2.37k
    }
1930
    /* Advancing i. */
1931
56.2k
    ++i;
1932
56.2k
  }
1933
  /* If not found, push this idx to the end of the array. */
1934
128k
  if (!found)
1935
70.2k
    ccv_array_push(tail, &idx);
1936
128k
}
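The routine above keeps, for each tensor block, only the extremal execution nodes: a new head replaces any existing head it precedes, is dropped if an existing head already precedes it, and is appended only when it is unordered with everything kept (and symmetrically for tails). The following standalone sketch models that pruning with plain arrays and a hypothetical reaches(a, b) reachability predicate standing in for the exec_dep sparse matrix; it illustrates the idea and is not part of the ccv API.

/* Minimal sketch of the head-list maintenance above; reaches() and head_insert()
 * are hypothetical names, the toy dependency table replaces exec_dep. */
#include <stdio.h>

#define MAX_HEAD 8

/* Toy partial order on 4 nodes: 0 -> 1 -> 3, 0 -> 2 -> 3. */
static int reaches(const int a, const int b)
{
  static const int dep[4][4] = {
    {0, 1, 1, 1},
    {0, 0, 0, 1},
    {0, 0, 0, 1},
    {0, 0, 0, 0},
  };
  return dep[a][b];
}

/* Insert idx so the list keeps only the earliest, mutually-unordered nodes. */
static void head_insert(int* const head, int* const rnum, const int idx)
{
  int i, found = 0;
  for (i = 0; i < *rnum;)
  {
    if (head[i] == idx) { found = 1; break; }
    if (reaches(idx, head[i])) {
      /* idx precedes an existing head: replace it, or drop a now-redundant entry. */
      if (!found) { found = 1; head[i] = idx; }
      else { head[i] = head[--*rnum]; continue; }
    } else if (reaches(head[i], idx)) {
      /* idx is strictly later than an existing head: nothing to add. */
      found = 1;
      break;
    }
    ++i;
  }
  if (!found && *rnum < MAX_HEAD)
    head[(*rnum)++] = idx;
}

int main(void)
{
  int head[MAX_HEAD], rnum = 0, i;
  head_insert(head, &rnum, 3); /* head = {3} */
  head_insert(head, &rnum, 1); /* 1 precedes 3, head = {1} */
  head_insert(head, &rnum, 2); /* unordered w.r.t. 1, head = {1, 2} */
  head_insert(head, &rnum, 0); /* 0 precedes both, head collapses to {0} */
  for (i = 0; i < rnum; i++)
    printf("head[%d] = %d\n", i, head[i]);
  return 0;
}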
1937
1938
ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1939
7.00k
{
1940
7.00k
  if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1941
6.90k
  {
1942
6.90k
    assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size);
1943
6.90k
    ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1944
6.90k
    if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor))
1945
11
    {
1946
11
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1947
22
      while (CCV_IS_TENSOR_MULTIVIEW(mv))
1948
11
        mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1949
11
      return (ccv_nnc_tensor_t*)mv;
1950
11
    }
1951
6.89k
    return tensor;
1952
6.90k
  }
1953
100
  int i;
1954
123
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1955
99
    if (tensor_arena->sub_arenas[i])
1956
99
    {
1957
99
      ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1958
99
      if (tensor)
1959
76
        return tensor;
1960
99
    }
1961
24
  return 0;
1962
100
}
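A hedged usage fragment for the lookup above (assuming a graph has already been compiled into tensor_arena and x is a ccv_nnc_tensor_symbol_t from that graph, set up elsewhere): the multiview unwrapping guarantees a plain tensor is returned, and 0 signals the symbol is not materialized in this arena or any of its sub-arenas.

  ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
  if (x_tensor) // 0 when the symbol is not materialized in this arena.
    x_tensor->data.f32[0] = 1;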
1963
1964
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1965
66.6k
{
1966
66.6k
  if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1967
66.6k
  {
1968
66.6k
    assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size);
1969
66.6k
    return graph_exec_arena->graph_execs[symbol.d];
1970
66.6k
  }
1971
7
  int i;
1972
9
  for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1973
7
    if (graph_exec_arena->sub_arenas[i])
1974
7
    {
1975
7
      ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1976
7
      if (!CCV_NO_GRAPH_EXEC(exec))
1977
5
        return exec;
1978
7
    }
1979
2
  return (ccv_nnc_graph_exec_t){}; // 0.
1980
7
}
1981
1982
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1983
9
{
1984
9
  return graph_exec_arena->source;
1985
9
}
1986
1987
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1988
9
{
1989
9
  return graph_exec_arena->destination;
1990
9
}
1991
1992
// Check whether the head is the beginning of this block.
1993
static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1994
50
{
1995
50
  assert(tensor_block->head);
1996
50
  return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0) == head_node);
1997
50
}
1998
1999
// Check whether the tail is the end of this block.
2000
static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2001
39
{
2002
39
  assert(tensor_block->tail);
2003
39
  return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0) == tail_node);
2004
39
}
2005
2006
// Make two tensor blocks one. Return 1 if that happened.
2007
static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2008
6.69k
{
2009
  // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2010
6.69k
  if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]) &&
2011
6.69k
    (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1]) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2012
6.69k
    tensor_blocks[p_ref_0].tail->rnum == 1 &&
2013
6.69k
    tensor_blocks[p_ref_1].head->rnum == 1 &&
2014
6.69k
    tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2015
6.69k
    *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0))
2016
6.38k
  {
2017
    // If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2018
6.38k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]));
2019
6.38k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]));
2020
6.38k
    ccv_array_free(tensor_blocks[p_ref_0].tail);
2021
6.38k
    tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2022
6.38k
    if (tensor_blocks[p_ref_1].p_refs[0])
2023
14
    {
2024
14
      assert(tensor_blocks[p_ref_1].p_refs[1] == 0); // It simply cannot have more than one p_refs, otherwise we cannot merge.
2025
14
      if (!tensor_blocks[p_ref_0].p_refs[0])
2026
10
        tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2027
4
      else
2028
4
        tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2029
14
    }
2030
6.38k
    tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2031
6.38k
    TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]));
2032
6.38k
    ccv_array_free(tensor_blocks[p_ref_1].head);
2033
6.38k
    if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1]))
2034
16
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]);
2035
    // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2036
6.38k
    TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1]);
2037
6.38k
    tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2038
6.38k
    if (!tensor_blocks[p_ref_0].r_refs)
2039
6.20k
      tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2040
6.38k
    ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2041
6.38k
    tensor_blocks[p_ref_1].size = 0;
2042
6.38k
    tensor_blocks[p_ref_1].head = 0;
2043
6.38k
    tensor_blocks[p_ref_1].tail = 0;
2044
6.38k
    return 1;
2045
6.38k
  }
2046
312
  return 0;
2047
6.69k
}
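The fold above concatenates two tensor blocks into one lifetime when the producer's unique last use coincides with the consumer's unique first use (on top of the unfoldability and type checks). Below is a simplified, self-contained model of that meeting-point condition, using a hypothetical block_t that tracks a single head/tail node instead of the ccv_array_t lists and flags of the real ccv_nnc_tensor_block_t.

#include <stdio.h>

typedef struct {
  int head; /* single head node, -1 if more than one */
  int tail; /* single tail node, -1 if more than one */
  int type;
} block_t;

/* Two blocks fold when the writer's unique last use is exactly the reader's
 * unique first use and both live in the same kind of memory. */
static int try_fold(block_t* const blocks, const int p_ref_0, const int p_ref_1)
{
  if (blocks[p_ref_0].tail >= 0 && blocks[p_ref_1].head >= 0 &&
    blocks[p_ref_0].type == blocks[p_ref_1].type &&
    blocks[p_ref_0].tail == blocks[p_ref_1].head)
  {
    /* Concatenate life-times: p_ref_0 now ends where p_ref_1 ended. */
    blocks[p_ref_0].tail = blocks[p_ref_1].tail;
    return 1;
  }
  return 0;
}

int main(void)
{
  block_t blocks[2] = {
    { .head = 0, .tail = 2, .type = 0 }, /* input, last read at node 2 */
    { .head = 2, .tail = 5, .type = 0 }, /* output, first written at node 2 */
  };
  printf("folded: %d, new tail: %d\n", try_fold(blocks, 0, 1), blocks[0].tail);
  return 0;
}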
2048
2049
static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2050
6.23k
{
2051
6.23k
  int i, j, k;
2052
  // Generate exec dependencies (or, in other words, partial ordering of executions).
2053
6.23k
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2054
6.23k
  int* buf = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2055
6.23k
  int buf_size;
2056
6.23k
  if (p_node_info)
2057
62
    { assert(output_size == 0); }
2058
6.23k
#define for_block(x, val) \
2059
212k
  do { \
2060
212k
    if (((int32_t*)val)[0] > 0) \
2061
212k
    { \
2062
212k
      buf[buf_size * 2] = x; \
2063
212k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2064
212k
      ++buf_size; \
2065
212k
    } \
2066
212k
  } while (0)
2067
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
2068
32.1k
    buf_size = 0; /* save all its parent deps to this buffer */
2069
32.1k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2070
32.1k
    if (vector)
2071
212k
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
2072
32.1k
    if (!node->outgoings)
2073
6.90k
      continue;
2074
53.3k
    for (i = 0; i < node->outgoings->rnum; i++)
2075
28.0k
    {
2076
28.0k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
2077
28.0k
      const int32_t one = 1;
2078
28.0k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2079
      /* If not found, set it. If the current node is the destination node, there is no need
2080
       * to set itself as the parent of subsequent nodes because of its terminal nature. */
2081
28.0k
      if (!cell.i32 || cell.i32[0] == 0)
2082
28.0k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2083
28.0k
      if (buf_size > 0)
2084
22.6k
      {
2085
22.6k
        ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2086
22.6k
        assert(vector);
2087
257k
        for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2088
234k
        {
2089
234k
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2090
          /* If not found, set */
2091
234k
          if (!cell.i32 || cell.i32[0] == 0)
2092
203k
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2093
30.8k
          else {
2094
            /* Otherwise, set to the longest one */
2095
30.8k
            int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
2096
30.8k
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2097
30.8k
          }
2098
234k
        }
2099
22.6k
      }
2100
28.0k
    }
2101
25.2k
  } ccv_nnc_graph_visit_endfor
2102
6.23k
#undef for_block
2103
6.23k
  ccfree(buf);
2104
  // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2105
6.23k
  const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2106
6.23k
  ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2107
  // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
2108
  // happens that I have to loop through all relevant nodes to find out if one is used or not.
2109
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++91.4k
)
2110
91.4k
    tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2111
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2112
123k
    for (i = 0; i < node->input_size; 
i++90.8k
)
2113
90.8k
      if (node->inputs[i] >= 0)
2114
64.3k
      {
2115
64.3k
        tensor_blocks[node->inputs[i]].flags = 0;
2116
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2117
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2118
64.3k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
2119
64.3k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2120
18
          tensor_blocks[node->inputs[i]].pin_mem = 1;
2121
64.3k
      }
2122
82.5k
    for (i = 0; i < node->output_size; 
i++50.3k
)
2123
50.3k
      if (node->outputs[i] >= 0)
2124
41.4k
      {
2125
41.4k
        tensor_blocks[node->outputs[i]].flags = 0;
2126
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2127
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2128
41.4k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
2129
41.4k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2130
16
          tensor_blocks[node->outputs[i]].pin_mem = 1;
2131
41.4k
      }
2132
32.1k
  } ccv_nnc_graph_visit_endfor
2133
6.23k
  if (p_node_info)
2134
62
  {
2135
62
    assert(p_tensor_symbol_info);
2136
    // Mark it as used if it is used in either input or output.
2137
165
    
for (i = 0; 62
i < p_node_info->input_size;
i++103
)
2138
103
      if (p_node_info->inputs[i] >= 0)
2139
103
      {
2140
103
        const int d = p_node_info->inputs[i];
2141
103
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2142
92
        {
2143
92
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
2144
92
          if (dd >= 0) // If this exists in this sub-graph, great.
2145
80
            tensor_blocks[dd].flags = 0;
2146
92
        }
2147
103
      }
2148
132
    for (i = 0; i < p_node_info->output_size; 
i++70
)
2149
70
      if (p_node_info->outputs[i] >= 0)
2150
70
      {
2151
70
        const int d = p_node_info->outputs[i];
2152
70
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2153
70
        {
2154
70
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
2155
70
          if (dd >= 0) // If this exists in this sub-graph, great.
2156
70
            tensor_blocks[dd].flags = 0;
2157
70
        }
2158
70
      }
2159
62
  }
2160
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2161
91.4k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
2162
70.6k
    {
2163
      // Check no tensor info is auto now.
2164
70.6k
      assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
2165
      // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
2166
      // therefore, itself life-cycle almost certainly won't concatenate properly with the tensor to
2167
      // fold to).
2168
70.6k
      if (tensor_symbol_info[i].assign_ref)
2169
40
      {
2170
        // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2171
        // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input,
2172
        // it kept its own representation, which is not the case for output).
2173
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2174
40
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2175
        // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2176
40
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref]);
2177
        // It also cannot be folded as output (except i), because we need to keep its own representation.
2178
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref]);
2179
40
        assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0);
2180
40
        tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2181
63
        for (j = 0; j < unroll_count; 
j++23
)
2182
23
        {
2183
23
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
2184
23
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
2185
23
        }
2186
40
        if (tensor_blocks[assign_ref].bypass_ref)
2187
4
        {
2188
          // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2189
4
          tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2190
4
          const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2191
4
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref]);
2192
4
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref]);
2193
          // On the other hand, it can be folded into the except_ref for the bypass_ref.
2194
4
          tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2195
4
          if (dup_tensor_from_ref)
2196
2
          {
2197
2
            const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2198
2
            if (bypass_from_ref >= 0)
2199
2
            {
2200
2
              TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref]);
2201
2
              TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref]);
2202
2
              assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref);
2203
2
              for (j = 0; j < unroll_count - 1; 
j++0
)
2204
0
              {
2205
                // Mark every incarnation as unfold-able.
2206
0
                TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
2207
0
                TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
2208
0
              }
2209
2
            }
2210
2
          }
2211
4
        }
2212
40
      }
2213
70.6k
    }
2214
97.6k
  
for (i = 0; 6.23k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.4k
)
2215
91.4k
  {
2216
    // If it has a pair reference, we don't need to allocate this tensor at all,
2217
    // set it to be unassigned.
2218
91.4k
    if (tensor_symbol_info[i].pair_ref)
2219
15
      TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i]);
2220
    // If it is a tape variable, set it to be un-foldable as too (otherwise we cannot use tape properly).
2221
91.4k
    else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2222
7
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2223
7
      TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2224
      // For this case, there is no exception.
2225
7
      tensor_blocks[i].unfoldable_except_ref = 0;
2226
91.3k
    } else if (tensor_symbol_info[i].p_ref) {
2227
119
      assert(p_node_info);
2228
119
      const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2229
      // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2230
119
      if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2231
        // TODO: This check can be lifted if we can fold in the parent graph.
2232
48
        if (-1 == p_ref_is_in_or_out)
2233
20
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2234
119
      if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2235
68
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2236
119
    }
2237
91.4k
  }
2238
97.6k
  
for (i = 0; 6.23k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.4k
)
2239
91.4k
  {
2240
91.4k
    if (tensor_symbol_info[i].alias_ref)
2241
3.26k
    {
2242
3.26k
      const int ref = tensor_symbol_info[i].alias_ref - 1;
2243
      // If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2244
3.26k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
2245
1.04k
        tensor_blocks[ref].flags = 0;
2246
      // An alias cannot refer to another alias.
2247
3.26k
      assert(!tensor_symbol_info[ref].alias_ref);
2248
3.26k
      tensor_blocks[i].flags = ALIAS;
2249
3.26k
      tensor_blocks[i].ref = ref + 1; // Assign the ref.
2250
3.26k
      if (!tensor_blocks[ref].r_refs)
2251
3.22k
        tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2252
3.26k
      ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2253
3.26k
    }
2254
91.4k
  }
2255
  // Scan again and if the ref is not assigned, mark the alias not assigned.
2256
97.6k
  
for (i = 0; 6.23k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.4k
)
2257
91.4k
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
2258
3.26k
    {
2259
3.26k
      const int ref = tensor_blocks[i].ref - 1;
2260
3.26k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]))
2261
539
      {
2262
        // Mark this as unassigned.
2263
539
        tensor_blocks[i].flags = UNASSIGNED;
2264
539
        tensor_blocks[i].ref = 0;
2265
539
      }
2266
3.26k
    }
2267
97.6k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++91.4k
)
2268
91.4k
  {
2269
    // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2270
91.4k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
2271
68.9k
    {
2272
68.9k
      tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2273
68.9k
      tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2274
      // Cache tensor size (align to 16 bytes).
2275
68.9k
      tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2276
68.9k
    }
2277
    // If there is a p_ref, add the one to the p_refs list.
2278
91.4k
    if (tensor_symbol_info[i].p_ref)
2279
128
      tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2280
91.4k
  }
2281
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2282
123k
    for (i = 0; i < node->input_size; 
i++90.8k
)
2283
90.8k
    {
2284
90.8k
      int d = node->inputs[i];
2285
90.8k
      if (d < 0)
2286
26.5k
        continue;
2287
64.3k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2288
1.57k
        d = tensor_symbol_info[d].alias_ref - 1;
2289
64.3k
      tensor_blocks[d].flags |= READ_ONLY;
2290
64.3k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2291
15
        continue;
2292
64.3k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2293
      /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph
2294
       * from the very beginning of the graph life-cycle and ends here). */
2295
64.3k
      if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags))
2296
27.4k
      {
2297
87.2k
        for (j = 0; j < source_size; 
j++59.7k
)
2298
59.7k
        {
2299
          // If the source is connected to the current node, add it (otherwise we would create tensor blocks used in other streams, which is unnecessary).
2300
59.7k
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2301
59.7k
          if (cell.i32 && 
cell.i32[0] > 022.7k
)
2302
22.7k
            _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2303
59.7k
        }
2304
        /* If this is a read-only (based on SSA, if first encountered as read), and this is
2305
         * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2306
         * loop, however, in that case, you need to prevent read-only gets reused for the
2307
         * output tensor, which is not obvious how to implement correctly), and it is not
2308
         * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2309
         * of memory anyway (because on second loop, we want to read the same value out).
2310
         * Mark it to the end of the graph. */
2311
27.4k
        if (p_node_info && 
!tensor_symbol_info[d].assign_ref146
)
2312
210
          
for (j = 0; 105
j < destination_size;
j++105
)
2313
105
          {
2314
            // If the destination is connected to the current node, add it (otherwise we would create tensor blocks used in other streams, which is unnecessary).
2315
105
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2316
105
            if (cell.i32 && 
cell.i32[0] > 065
)
2317
65
              _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2318
105
          }
2319
27.4k
      }
2320
64.3k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2321
64.3k
    }
2322
82.5k
    
for (i = 0; 32.1k
i < node->output_size;
i++50.3k
)
2323
50.3k
    {
2324
50.3k
      int d = node->outputs[i];
2325
50.3k
      if (d < 0)
2326
8.91k
        continue;
2327
41.4k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2328
1.36k
        d = tensor_symbol_info[d].alias_ref - 1;
2329
41.4k
      tensor_blocks[d].flags |= WRITE_ONLY;
2330
41.4k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2331
0
        continue;
2332
41.4k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2333
41.4k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2334
41.4k
    }
2335
32.1k
  } ccv_nnc_graph_visit_endfor
2336
  // For any assign_ref, its life-time kept until the end and wrap over.
2337
97.6k
  
for (i = 0; 6.23k
i < symbolic_graph->tensor_symbol_info->rnum;
i++91.4k
)
2338
    // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2339
    // that "somewhere else" need to keep its life-time til the end.
2340
91.4k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) &&
2341
91.4k
      p_node_info && tensor_symbol_info[i].assign_ref)
2342
42
    {
2343
42
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2344
84
      for (j = 0; j < destination_size; 
j++42
)
2345
42
      {
2346
        // This logic is to be more conservative about which destination we add to.
2347
        // As of now, if we add everything, it is fine most likely. However, it may
2348
        // cause issues in the future to do so naively. Thus, instead, we only add
2349
        // the destination to it iff either the tensor is not used at all, or the
2350
        // destination is on the same stream as the tensor block in some way.
2351
42
        int flag = !tensor_blocks[assign_ref].tail;
2352
83
        for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2353
41
        {
2354
41
          const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k);
2355
41
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2356
41
          flag = (cell.i32 && 
cell.i32[0] > 010
);
2357
41
        }
2358
42
        if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2359
10
          _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2360
42
      }
2361
42
    }
2362
6.33k
  for (i = 0; i < output_size; 
i++99
)
2363
99
  {
2364
99
    assert(outputs[i].graph == symbolic_graph);
2365
99
    int d = outputs[i].d;
2366
99
    if (d < 0)
2367
0
      continue;
2368
99
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2369
0
      d = tensor_symbol_info[d].alias_ref - 1;
2370
99
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2371
0
      continue;
2372
99
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2373
361
    
for (j = 0; 99
j < destination_size;
j++262
)
2374
262
    {
2375
262
      int flag = !tensor_blocks[d].tail;
2376
524
      for (k = 0; !flag && 
k < tensor_blocks[d].tail->rnum492
;
k++262
)
2377
262
      {
2378
262
        const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
2379
262
        const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2380
262
        flag = (cell.i32 && 
cell.i32[0] > 032
);
2381
262
      }
2382
262
      if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2383
32
        _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2384
262
    }
2385
99
  }
2386
  // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2387
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2388
32.1k
    int x, y;
2389
123k
    for (x = 0; x < node->input_size; 
x++90.8k
)
2390
260k
      
for (y = 0; 90.8k
y < node->output_size;
y++169k
)
2391
        /* Some operations enforces some tensors to be the same for inputs / outputs. */
2392
169k
        if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2393
180
        {
2394
          // If both unassigned, it is fine.
2395
180
          if (node->inputs[x] < 0 && 
node->outputs[y] < 00
)
2396
0
            continue;
2397
180
          int ref = node->inputs[x];
2398
180
          assert(ref >= 0);
2399
180
          while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref0
)
2400
0
            ref = tensor_blocks[ref].ref - 1;
2401
180
          const int node_output_y = node->outputs[y];
2402
180
          assert(node_output_y >= 0);
2403
          // If both are not computable, it is fine, we don't need to enforce.
2404
180
          if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2405
180
            !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y]))
2406
0
            continue;
2407
          // Otherwise, enforce and error out if failed.
2408
180
          if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2409
0
            { assert(0 && "cannot enforce inplace for the two tensors"); }
2410
180
        }
2411
32.1k
  } ccv_nnc_graph_visit_endfor
2412
  // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2413
  // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2414
  // that is not enforced in-place (because the tensor enforced in-place will be different than the
2415
  // binding one).
2416
53.8k
  
for (i = 0; 6.23k
i < tensor_bind_size;
i++47.5k
)
2417
47.5k
  {
2418
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2419
    // If there is a tensor bound, then it is unassigned.
2420
47.5k
    if (resolved_symbol.d >= 0)
2421
47.5k
    {
2422
47.5k
      int d = resolved_symbol.d;
2423
      // I cannot assert too much at this moment.
2424
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2425
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2426
      // This check is for in-place ops. Only an in-place op could be unassigned but still carry a ref.
2427
      // It has nothing to do with alias.
2428
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && 
tensor_blocks[d].ref12.5k
)
2429
146
        d = tensor_blocks[d].ref - 1;
2430
      // Doesn't work if this is a loop carrying variable.
2431
47.5k
      assert(!tensor_symbol_info[d].assign_ref);
2432
47.5k
      tensor_blocks[d].flags = UNASSIGNED;
2433
47.5k
      tensor_blocks[d].ref = 0; // No need to have ref as well.
2434
47.5k
    }
2435
47.5k
  }
2436
  // Maximize tensor reuse by collapsing tensors that allow in-place operations (and where the start / end tensors match).
2437
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2438
32.1k
    int x, y;
2439
123k
    for (x = 0; x < node->input_size; 
x++90.8k
)
2440
90.8k
    {
2441
      /* If the input is not assigned, it can be referenced, find the referenced one */
2442
90.8k
      int ref = node->inputs[x];
2443
90.8k
      if (ref < 0)
2444
26.5k
        continue;
2445
64.3k
      const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2446
71.6k
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref38.3k
)
2447
7.27k
        ref = tensor_blocks[ref].ref - 1;
2448
64.3k
      assert(tensor_blocks[ref].ref == 0);
2449
64.3k
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2450
64.3k
        tensor_blocks[ref].tail->rnum == 1)
2451
33.0k
      {
2452
86.7k
        for (y = 0; y < node->output_size; 
y++53.6k
)
2453
          /* Only proceed if the input symbol is different from the output symbol, */
2454
          /* and the input symbol meets the output symbol exactly at the same spot. */
2455
53.6k
          if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2456
53.6k
            node->outputs[y] >= 0 &&
2457
53.6k
            ref != node->outputs[y] &&
2458
53.6k
            TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]]))
2459
6.52k
          {
2460
6.52k
            const int node_output_y = node->outputs[y];
2461
6.52k
            const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2462
            /* If dimension matches perfectly, then we can assign y_symbol to x.
2463
             * If both of them are aliases, making sure their origin matches in size too. */
2464
6.52k
            if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
2465
6.51k
            {
2466
6.51k
              _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2467
              // This refers to an alias itself, now mark it and will be processed later.
2468
6.51k
              if (ref != node->inputs[x])
2469
290
                tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2470
6.51k
            }
2471
6.52k
          }
2472
33.0k
      }
2473
64.3k
    }
2474
32.1k
  } ccv_nnc_graph_visit_endfor
2475
  // Specifically handle the bypass. This needs to be done after the first pass.
2476
  // I need to extend the bypass life-time to the same as the one I am going with.
2477
  // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2478
6.23k
  ccv_nnc_tensor_block_t empty_block = {};
2479
6.23k
  empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2480
6.23k
  empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2481
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2482
32.1k
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2483
13
    {
2484
13
      int can_bypass = 1;
2485
28
      for (i = 0; can_bypass && 
i < node->output_size25
;
i++15
)
2486
15
      {
2487
15
        int d = node->outputs[i];
2488
15
        if (d < 0)
2489
0
          continue;
2490
15
        if (!tensor_blocks[d].bypass_ref)
2491
2
          continue;
2492
13
        while (tensor_blocks[d].ref)
2493
0
          d = tensor_blocks[d].ref - 1;
2494
13
        int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2495
14
        while (tensor_blocks[bypass_ref].ref)
2496
1
          bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2497
        // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2498
13
        if (!tensor_symbol_info[bypass_ref].assign_ref && 
!tensor_symbol_info[bypass_ref].r_assign_ref10
)
2499
10
          continue;
2500
3
        ccv_array_clear(empty_block.head);
2501
6
        for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; 
j++3
)
2502
3
          ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j));
2503
3
        ccv_array_clear(empty_block.tail);
2504
6
        for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; 
j++3
)
2505
3
          ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j));
2506
6
        for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++3
)
2507
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), empty_block);
2508
6
        for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++3
)
2509
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), empty_block);
2510
        // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2511
3
        assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref));
2512
3
        int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2513
3
        while (tensor_blocks[b_ref].ref)
2514
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2515
3
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2516
3
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2517
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2518
        // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2519
3
        can_bypass = can_bypass && (a_hop_b || b_hop_a);
2520
3
      }
2521
13
      if (can_bypass)
2522
10
      {
2523
22
        for (i = 0; i < node->output_size; 
i++12
)
2524
12
        {
2525
12
          int d = node->outputs[i];
2526
12
          if (d < 0)
2527
0
            continue;
2528
12
          if (!tensor_blocks[d].bypass_ref)
2529
2
            continue;
2530
10
          while (tensor_blocks[d].ref)
2531
0
            d = tensor_blocks[d].ref - 1;
2532
10
          int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2533
10
          while (tensor_blocks[bypass_ref].ref)
2534
0
            bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2535
          // The bypass_ref can extend its life-time.
2536
20
          for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++10
)
2537
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), tensor_blocks[bypass_ref]);
2538
20
          for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++10
)
2539
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), tensor_blocks[bypass_ref]);
2540
10
        }
2541
10
      } else {
2542
6
        for (i = 0; i < node->output_size; 
i++3
)
2543
3
          tensor_blocks[node->outputs[i]].bypass_ref = 0;
2544
3
        const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2545
        // Mark this exec as no bypass IO (thus, I need to insert explicit data transfers).
2546
3
        exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2547
3
      }
2548
13
    }
2549
32.1k
  } ccv_nnc_graph_visit_endfor
2550
6.23k
  ccv_array_free(empty_block.head);
2551
6.23k
  ccv_array_free(empty_block.tail);
2552
6.23k
  *r_exec_dep = exec_dep;
2553
6.23k
  *r_tensor_blocks = tensor_blocks;
2554
6.23k
}
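The first stage of the function above builds exec_dep so that the cell (child, parent) stores the longest hop distance from parent to child: visiting nodes in topological order, each outgoing edge is recorded as one hop, and all of the current node's own dependencies are pushed to the outgoing node one hop further, keeping the maximum when a dependency is reachable along several paths. Below is a dense-matrix sketch of that propagation; the dense array and the hard-coded toy graph are assumptions made for brevity, while the real code uses ccv_sparse_matrix_t and a scratch buffer.

#include <stdio.h>

#define N 4

int main(void)
{
  /* Toy graph visited in topological order: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3. */
  const int outgoings[N][N] = { {1, 2, -1}, {3, -1}, {3, -1}, {-1} };
  int dep[N][N] = {0}; /* dep[child][parent] = longest hop count, 0 = no dependency */
  int idx, i, j;
  for (idx = 0; idx < N; idx++)
    for (i = 0; outgoings[idx][i] >= 0; i++)
    {
      const int outgoing = outgoings[idx][i];
      if (dep[outgoing][idx] == 0)
        dep[outgoing][idx] = 1; /* direct edge: one hop */
      /* Inherit idx's own dependencies, one hop further away,
       * keeping the longest distance seen on any path. */
      for (j = 0; j < N; j++)
        if (dep[idx][j] > 0 && dep[idx][j] + 1 > dep[outgoing][j])
          dep[outgoing][j] = dep[idx][j] + 1;
    }
  printf("dep[3][0] = %d hops\n", dep[3][0]); /* prints 2 */
  return 0;
}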
2555
2556
static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2557
33
{
2558
33
  if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || 
cmd.cmd == CCV_NNC_GRAPH_BACKWARD30
)
2559
3
  {
2560
3
    ccv_nnc_cmd_t retval = cmd;
2561
3
    retval.cmd = CCV_NNC_NOOP;
2562
3
    return retval;
2563
3
  }
2564
30
  return cmd;
2565
33
}
2566
2567
static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2568
102
{
2569
102
  if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2570
47
  {
2571
47
    if (tensor_symbol_info[input].alias_ref)
2572
18
    {
2573
18
      const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2574
18
      assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2575
18
      ccv_nnc_tensor_symbol_t tensor_symbol = {};
2576
18
      if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2577
6
      {
2578
6
        tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2579
6
        if (tensor_symbol_info[alias_ref].pair_ref)
2580
0
          ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2581
0
            .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2582
0
            .graph = dup_graph->pair
2583
0
          });
2584
6
        ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2585
6
        dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2586
12
      } else {
2587
12
        tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2588
12
        tensor_symbol.graph = dup_graph;
2589
12
      }
2590
18
      ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2591
18
      if (tensor_symbol_info[input].pair_ref)
2592
0
        ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2593
0
          .d = tensor_symbol_info[input].pair_ref - 1,
2594
0
          .graph = dup_graph->pair
2595
0
        });
2596
18
      ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2597
18
      dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2598
29
    } else {
2599
29
      ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2600
29
      if (tensor_symbol_info[input].pair_ref)
2601
4
        ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2602
4
          .d = tensor_symbol_info[input].pair_ref - 1,
2603
4
          .graph = dup_graph->pair
2604
4
        });
2605
29
      ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2606
29
      dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2607
29
    }
2608
47
    if (tensor_symbol_info[input].bypass_ref)
2609
2
    {
2610
2
      const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2611
2
      assert(dup_bypass_ref >= 0);
2612
2
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2613
2
      symbol_info->bypass_ref = dup_bypass_ref + 1;
2614
2
    }
2615
47
  }
2616
102
  return (ccv_nnc_tensor_symbol_t) {
2617
102
    .d = dup_tensor_block_ref[input * unroll_count],
2618
102
    .graph = dup_graph,
2619
102
  };
2620
102
}
2621
2622
static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2623
72
{
2624
72
  int i;
2625
72
  if (dup_exec_ref[idx * unroll_count] < 0)
2626
44
  {
2627
    // Input has to come before output, because the output could have a bypass reference to the input.
2628
116
    for (i = 0; i < node->input_size; 
i++72
)
2629
72
      max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2630
75
    for (i = 0; i < node->output_size; 
i++31
)
2631
31
      max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2632
44
    ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2633
44
    dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2634
44
  }
2635
72
  return (ccv_nnc_graph_exec_symbol_t) {
2636
72
    .d = dup_exec_ref[idx * unroll_count],
2637
72
    .graph = dup_graph,
2638
72
  };
2639
72
}
2640
2641
static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2642
6.23k
{
2643
6.23k
  int i;
2644
97.7k
  for (i = 0; i < tensor_block_size; 
i++91.5k
)
2645
91.5k
  {
2646
91.5k
    if (tensor_blocks[i].head)
2647
62.6k
      ccv_array_free(tensor_blocks[i].head);
2648
91.5k
    if (tensor_blocks[i].tail)
2649
62.6k
      ccv_array_free(tensor_blocks[i].tail);
2650
91.5k
    if (tensor_blocks[i].r_refs)
2651
9.42k
      ccv_array_free(tensor_blocks[i].r_refs);
2652
91.5k
    if (tensor_blocks[i].dup_p_refs)
2653
22
      ccv_array_free(tensor_blocks[i].dup_p_refs);
2654
91.5k
  }
2655
6.23k
  ccfree(tensor_blocks);
2656
6.23k
}
2657
2658
// Find tensors that cannot be solved by co-allocating to the same location.
2659
static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2660
21
{
2661
21
  int i, j, unroll_count = 0;
2662
131
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++110
)
2663
110
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_symbol_info[i].assign_ref90
)
2664
25
    {
2665
      // This is a parameter, thus, it has to be either an alias or used.
2666
25
      assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2667
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2668
      // The parameter it assign to has to be either an alias or used.
2669
25
      assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2670
      // If either of the two (assigner and assignee) is an alias, check to see if they are the same.
2671
      // If they are the same, we are good, no need to extend.
2672
25
      int a_ref = i;
2673
25
      while (tensor_blocks[a_ref].ref)
2674
0
        a_ref = tensor_blocks[a_ref].ref - 1;
2675
25
      int b_ref = assign_ref;
2676
31
      while (tensor_blocks[b_ref].ref)
2677
6
        b_ref = tensor_blocks[b_ref].ref - 1;
2678
25
      if (a_ref != b_ref)
2679
19
      {
2680
        // If any of the b's head is deterministically later than a's tail
2681
        // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2682
19
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2683
19
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2684
        // It cannot be that both i can hop to j and j can hop to i.
2685
19
        assert(!(a_hop_b > 0 && b_hop_a > 0));
2686
        // Can it be folded
2687
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2688
19
        if (a_hop_b || 
b_hop_a16
)
2689
3
        {
2690
3
          tensor_blocks[a_ref].companion_ref = b_ref + 1;
2691
3
          tensor_blocks[b_ref].companion_ref = a_ref + 1;
2692
3
          continue;
2693
3
        }
2694
16
        int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2695
20
        for (j = 0; c_ref >= 0; 
j++4
)
2696
4
        {
2697
4
          while (tensor_blocks[c_ref].ref)
2698
0
            c_ref = tensor_blocks[c_ref].ref - 1;
2699
4
          c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2700
4
        }
2701
16
        unroll_count = ccv_max(unroll_count, j + 1);
2702
16
      }
2703
25
    }
2704
  // Reset companion_ref if need to unroll.
2705
21
  if (unroll_count)
2706
91
    
for (j = 0; 13
j < symbolic_graph->tensor_symbol_info->rnum;
j++78
)
2707
78
      tensor_blocks[j].companion_ref = 0;
2708
21
  return unroll_count;
2709
21
}
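The unroll count above is driven by how long the assign_ref chain of a loop-carried variable is when its two ends cannot share memory: the inner j-loop walks the chain link by link. Below is a toy walk of that chain mirroring the j-loop; the arrays and indices here are hypothetical stand-ins (1-based like the symbol info's assign_ref), not the real symbol tables.

#include <stdio.h>

int main(void)
{
  /* assign_ref[i] is 1-based like in the symbol info; 0 means none.
   * Chain: tensor 2 is assigned from tensor 1, tensor 1 is assigned from tensor 0. */
  const int assign_ref[3] = { 0, 1, 2 };
  int unroll_count = 0, i, j;
  for (i = 0; i < 3; i++)
    if (assign_ref[i])
    {
      /* Start one link past the direct assigner and count the remaining links. */
      int c_ref = assign_ref[assign_ref[i] - 1] - 1;
      for (j = 0; c_ref >= 0; j++)
        c_ref = assign_ref[c_ref] - 1;
      if (j + 1 > unroll_count)
        unroll_count = j + 1;
    }
  printf("unroll_count = %d\n", unroll_count); /* prints 2 for the length-3 chain */
  return 0;
}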
2710
2711
static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2712
13
{
2713
13
  int i, j, n;
2714
  // The inout exec nodes, these are the nodes we are going to extend.
2715
13
  uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2716
13
  int max_input_size = 0;
2717
13
  int max_output_size = 0;
2718
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++35
)
2719
35
  {
2720
35
    max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
2721
35
    max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
2722
35
  }
2723
13
  ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
2724
13
  ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
2725
  // Doing graph expansion
2726
  // It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
2727
13
  assert(dup_graph->exec_symbol_info->rnum > 0);
2728
13
  assert(dup_graph->tensor_symbol_info->rnum > 0);
2729
88
#define INCOMING_NODE (1)
2730
28
#define OUTGOING_NODE (2)
2731
  // Unroll the graph n times.
2732
29
  
for (n = 0; 13
n < unroll_count;
n++16
)
2733
16
  {
2734
16
    int* const dup_exec_ref = r_dup_exec_ref + n;
2735
16
    const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2736
16
    int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2737
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2738
46
      dup_exec_ref[i * unroll_count] = -1;
2739
131
    for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++115
)
2740
115
    {
2741
      // If there is an assign_ref, that means I don't need to dup the tensor.
2742
115
      if (tensor_symbol_info[i].assign_ref)
2743
25
      {
2744
25
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2745
25
        dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2746
90
      } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY)
2747
      // If this is a read-only tensor block, no need to duplicate because the value never changes
2748
      // (note we handled assign_ref first), therefore, no need to generate duplicate.
2749
26
        dup_tensor_block_ref[i * unroll_count] = i;
2750
64
      else
2751
64
        dup_tensor_block_ref[i * unroll_count] = -1;
2752
115
    }
2753
    // Go through the original graph, make copies of the node if it is inout.
2754
44
    ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2755
44
      ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2756
44
      inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2757
44
      if (!node->outgoings)
2758
16
        continue;
2759
56
      
for (i = 0; 28
i < node->outgoings->rnum;
i++28
)
2760
28
      {
2761
28
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
2762
28
        inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2763
28
        ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2764
28
        ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2765
28
      }
2766
28
    } ccv_nnc_graph_visit_endfor
2767
    // Check the visitor are all marked as either incoming or outgoing.
2768
16
    const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2769
16
    const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2770
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2771
46
    {
2772
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2773
2
        continue;
2774
46
      assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
2775
      // If this is pure incoming nodes, then I need to concat this one with all original destination node
2776
44
      if (inout[i] == INCOMING_NODE)
2777
32
        
for (j = 0; 16
j < dup_destination_size;
j++16
)
2778
16
        {
2779
16
          ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2780
16
            .d = dup_destinations[j].d,
2781
16
            .graph = dup_graph,
2782
16
          }, (ccv_nnc_graph_exec_symbol_t) {
2783
16
            .d = dup_exec_ref[i * unroll_count],
2784
16
            .graph = dup_graph,
2785
16
          });
2786
16
        }
2787
44
    }
2788
16
    if (dup_graph->destinations)
2789
16
      ccv_array_clear(dup_graph->destinations);
2790
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2791
46
    {
2792
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2793
2
        continue;
2794
44
      const int d = dup_exec_ref[i * unroll_count];
2795
44
      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
2796
      // If this has no outgoing node, add to the destination.
2797
44
      if (!exec_symbol_info->outgoings || 
exec_symbol_info->outgoings->rnum == 028
)
2798
16
        ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2799
16
          .graph = dup_graph,
2800
16
          .d = d,
2801
16
        });
2802
44
    }
2803
16
  }
2804
13
#undef INCOMING_NODE
2805
13
#undef OUTGOING_NODE
2806
13
  ccfree(inout);
2807
13
}
2808
2809
static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2810
13
{
2811
13
  int i;
2812
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2813
    // Now can assign them (The dup) as companion.
2814
    // Get to the last one, which we will wrap over.
2815
78
    if (dup_tensor_symbol_info[i].assign_ref)
2816
17
    {
2817
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2818
17
      dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2819
17
      assert(dup_tensor_symbol_info[i].assign_ref);
2820
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2821
17
    }
2822
13
}
2823
2824
// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2825
// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2826
// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2827
static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2828
21
{
2829
21
  int i, j, k;
2830
45
  for (i = 0; i < p_node_info->output_size; i++)
2831
24
  {
2832
24
    const int d = p_node_info->outputs[i];
2833
24
    const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2834
24
    if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2835
6
      continue;
2836
36
    for (k = 0; k < destination_size; k++)
2837
18
      _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2838
    // Add the duplicated destinations to the tensor_block_ref.
2839
42
    for (j = 0; j < unroll_count; j++)
2840
48
      for (k = 0; k < destination_size; k++)
2841
24
      {
2842
24
        const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2843
24
        const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2844
24
        if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2845
24
          _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2846
24
      }
2847
18
  }
2848
21
}
2849
2850
static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2851
21
{
2852
21
  int i, j;
2853
21
  ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2854
21
  ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2855
  // blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2856
  // Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2857
  // No need to change anything, we are good.
2858
21
  const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2859
21
  if (!unroll_count)
2860
8
    return;
2861
  // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2862
  // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2863
13
  ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2864
13
  int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2865
13
  int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2866
13
  _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2867
13
  ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2868
13
  ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2869
26
  ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0);
2870
13
  ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2871
26
  _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2872
  // Free out the old exec_dep
2873
26
  ccv_matrix_free(exec_dep);
2874
  // and the tensor blocks, prepare for the new.
2875
26
  _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2876
  // A reverse map to find where the original tensor comes from.
2877
26
  int* dup_tensor_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2878
142
  for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2879
129
    dup_tensor_from_ref[i] = -1;
2880
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2881
193
    for (j = 0; j < unroll_count; j++)
2882
115
      if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2883
104
        dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2884
26
  int* dup_exec_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2885
90
  for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2886
77
    dup_exec_from_ref[i] = -1;
2887
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2888
35
  {
2889
35
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2890
2
      continue;
2891
33
    dup_exec_from_ref[i] = i; // Reference back.
2892
77
    for (j = 0; j < unroll_count; j++)
2893
44
      if (dup_exec_ref[i * unroll_count + j] >= 0)
2894
44
        dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2895
33
  }
2896
  // Reset all attr.
2897
26
  memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2898
26
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2899
26
  ccv_nnc_graph_visit_free(dup_visit);
2900
26
  ccfree(dup_exec_symbol_info);
2901
26
  ccfree(dup_exec_from_ref);
2902
26
  ccfree(dup_tensor_from_ref);
2903
  // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2904
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2905
    // Loop over all possible duplications to assign dup_p_ref properly.
2906
193
    for (j = 0; j < unroll_count; j++)
2907
115
    {
2908
115
      const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2909
115
      if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2910
44
      {
2911
44
        const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2912
44
        const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2913
44
        if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2914
28
        {
2915
28
          if (!tensor_blocks[dup_idx].dup_p_refs)
2916
22
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2917
28
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2918
28
        }
2919
44
        if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2920
44
          continue;
2921
0
        const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2922
0
        const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2923
0
        if (p_ref_1_is_in_or_out == 1)
2924
0
        {
2925
0
          if (!tensor_blocks[dup_idx].dup_p_refs)
2926
0
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2927
0
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2928
0
        }
2929
0
      }
2930
115
    }
2931
  // companion_ref
2932
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2933
    // Now can assign them (The dup) as companion.
2934
78
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && dup_tensor_symbol_info[i].assign_ref)
2935
17
    {
2936
      // Get to the last one, which we will wrap over.
2937
17
      const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2938
17
      if (assign_ref >= 0)
2939
17
      {
2940
17
        int b_ref = assign_ref;
2941
17
        while (tensor_blocks[b_ref].ref)
2942
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2943
17
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2944
17
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2945
        // It cannot be that both i can hop to j and j can hop to i.
2946
        // And there can now be a hop from one to the other after duplication.
2947
17
        assert(a_hop_b > 0 || b_hop_a > 0);
2948
17
        tensor_blocks[i].companion_ref = b_ref + 1;
2949
17
        tensor_blocks[b_ref].companion_ref = i + 1;
2950
17
      }
2951
17
    }
2952
13
  ccfree(dup_tensor_symbol_info);
2953
  // Extend the dup tensor block ref, prepare for future extensions.
2954
13
  dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2955
110
  for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2956
97
    dup_tensor_block_ref[i] = -1;
2957
  // Assign out changed properties.
2958
13
  *r_exec_dep = exec_dep;
2959
13
  *r_tensor_blocks = tensor_blocks;
2960
13
  *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2961
13
  *r_dup_graph = dup_graph;
2962
13
  *r_unroll_count = unroll_count;
2963
13
  *r_dup_exec_ref = dup_exec_ref;
2964
13
  *r_dup_tensor_block_ref = dup_tensor_block_ref;
2965
13
}
2966
2967
static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2968
31
{
2969
31
  if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2970
28
    return tensor_block_size;
2971
3
  int i;
2972
3
  const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2973
3
  int found_idx = tensor_block_size;
2974
3
  for (i = 0; i < anonymous_block_free_list_cap; i++)
2975
3
  {
2976
3
    const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2977
3
    assert(idx < tensor_block_size);
2978
    // If the type doesn't match, ignore.
2979
3
    if (tensor_blocks[idx].type != type)
2980
0
      continue;
2981
    // Heuristic about how to select the best tensor block to move forward.
2982
    // If a block that is large enough is found and there are no dup_p_refs, I cannot do better than this, just return directly.
2983
3
    if (tensor_blocks[idx].size >= size)
2984
3
    {
2985
3
      if (no_dup_p_refs)
2986
3
        return idx;
2987
      // Otherwise, we cannot do better than this only if the current tensor block's dup_p_refs is after (or at) the requested dup_p_refs;
2988
      // if that is the case, just return.
2989
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2990
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2991
0
        return idx;
2992
0
    }
2993
0
    int64_t found_idx_size_diff;
2994
0
    int64_t idx_size_diff;
2995
0
    if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2996
      // Now, compare whether this one or the found_idx one is better.
2997
      // At this point, there is no point of comparing the dup_p_refs, we only care about which one
2998
      // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
2999
0
      (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3000
0
    {
3001
0
      found_idx = idx;
3002
0
      continue;
3003
0
    }
3004
    // No need to update if found_idx is better than idx.
3005
0
    if (found_idx_size_diff > idx_size_diff)
3006
0
      continue;
3007
    // We bias towards the bigger one in case the size differences are the same.
3008
0
    if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3009
0
    {
3010
0
      found_idx = idx;
3011
0
      continue;
3012
0
    }
3013
0
    assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
3014
    // On a tie, check which one has tighter life-cycle.
3015
0
    if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3016
0
    {
3017
      // Check whether the current tensor blocks life-cycle is longer than the previous one.
3018
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3019
0
        (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3020
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3021
0
        found_idx = idx;
3022
0
      continue;
3023
0
    }
3024
    // Now both our size is smaller than requested size, in this case, we need to increase the tensor block size.
3025
    // We prefer to choose the one that has life-cycle closer to the expected ones.
3026
0
    if (no_dup_p_refs)
3027
0
    {
3028
      // Whoever is shorter wins.
3029
0
      if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3030
0
        (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3031
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3032
0
        found_idx = idx;
3033
0
      continue;
3034
0
    }
3035
0
    if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3036
0
      continue;
3037
0
    if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3038
0
    {
3039
0
      found_idx = idx;
3040
0
      continue;
3041
0
    }
3042
    // If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3043
0
    const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3044
0
    const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3045
0
    if (idx_after_request && found_idx_after_request)
3046
0
    {
3047
0
      if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3048
0
        found_idx = idx;
3049
0
      continue;
3050
0
    } else {
3051
      // We entered this branch because either idx_after_request is false or found_idx_after_request is false, or both.
3052
      // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3053
      // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3054
0
      if (!found_idx_after_request && (idx_after_request ||
3055
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3056
0
        found_idx = idx;
3057
0
      continue;
3058
0
    }
3059
0
  }
3060
0
  return found_idx;
3061
3
}
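The scan above mixes several concerns; distilled below is just the size heuristic, as a hedged sketch (the helper name and the simplified tie-breaking are mine; on a full tie the real code additionally compares dup_p_refs life-cycles):

#include <stdint.h>
#include <stdlib.h>

// Sketch of the distance-to-requested-size comparison used while scanning the
// anonymous block free list: strictly closer wins; on an equal distance the
// bigger block is preferred.
static int candidate_is_better_fit(const uint64_t candidate_size, const uint64_t best_size, const uint64_t requested_size)
{
	const int64_t candidate_diff = llabs((int64_t)candidate_size - (int64_t)requested_size);
	const int64_t best_diff = llabs((int64_t)best_size - (int64_t)requested_size);
	if (candidate_diff != best_diff)
		return candidate_diff < best_diff; // strictly closer to the request wins
	return candidate_size > best_size; // equal distance: bias towards the bigger block
}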
3062
3063
static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3064
49
{
3065
49
  if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3066
28
    return 0;
3067
21
  int i, j, k;
3068
21
  int input_size = 0;
3069
43
  for (i = 0; i < p_node_info->p_while.input_size; i++)
3070
22
    if (p_node_info->p_while.inputs[i] >= 0)
3071
2
      ++input_size;
3072
  // If it doesn't have tensor inputs (thus, only special inputs), just return.
3073
21
  if (!input_size)
3074
19
    return 0;
3075
2
  ccv_nnc_tensor_symbol_t inputs[input_size];
3076
2
  input_size = 0;
3077
6
  for (i = 0; i < p_node_info->p_while.input_size; i++)
3078
4
    if (p_node_info->p_while.inputs[i] >= 0)
3079
2
      inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3080
2
        .d = p_node_info->p_while.inputs[i],
3081
2
        .graph = symbolic_graph,
3082
2
      };
3083
2
  assert(symbolic_graph->breakpoint_size > 0);
3084
2
  ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
3085
2
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3086
4
  for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3087
2
  {
3088
    // Make a noop copy of the breakpoint, but with some tensor inputs.
3089
2
    ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
3090
2
    ccv_array_push(dup_breakpoints, &noop);
3091
    // Connect this noop to the outgoing nodes of breakpoints.
3092
2
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
3093
2
    if (symbol_info->outgoings)
3094
4
      for (j = 0; j < symbol_info->outgoings->rnum; j++)
3095
2
      {
3096
2
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3097
2
        ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3098
2
          .d = d,
3099
2
          .graph = symbolic_graph,
3100
2
        });
3101
2
      }
3102
2
  }
3103
7
  for (i = 0; i < exec_symbol_info_size; i++)
3104
5
  {
3105
5
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
3106
5
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
3107
0
      continue;
3108
5
    if (symbol_info->outgoings)
3109
3
    {
3110
3
      const int outgoing_size = symbol_info->outgoings->rnum;
3111
6
      for (j = 0; j < outgoing_size; j++)
3112
3
      {
3113
3
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
3114
6
        for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3115
3
          if (d == symbolic_graph->breakpoints[k].d)
3116
0
          {
3117
0
            ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
3118
0
            ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3119
0
              .d = i,
3120
0
              .graph = symbolic_graph,
3121
0
            }, noop);
3122
            // Found, connected, exit.
3123
0
            break;
3124
0
          }
3125
3
      }
3126
3
    }
3127
5
  }
3128
  // Add the dup_breakpoints to source if necessary.
3129
2
  assert(symbolic_graph->sources);
3130
2
  const int source_size = symbolic_graph->sources->rnum;
3131
4
  for (i = 0; i < source_size; i++)
3132
2
  {
3133
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
3134
2
    for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3135
2
      if (d == symbolic_graph->breakpoints[j].d)
3136
2
      {
3137
2
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3138
2
        ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3139
        // Found, made, exit.
3140
2
        break;
3141
2
      }
3142
2
  }
3143
  // Add the dup_breakpoints to destination if necessary.
3144
2
  assert(symbolic_graph->destinations);
3145
2
  const int destination_size = symbolic_graph->destinations->rnum;
3146
4
  for (i = 0; i < destination_size; i++)
3147
2
  {
3148
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
3149
4
    for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3150
2
      if (d == symbolic_graph->breakpoints[j].d)
3151
0
      {
3152
0
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3153
0
        ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3154
        // Found, made, exit.
3155
0
        break;
3156
0
      }
3157
2
  }
3158
2
  return dup_breakpoints;
3159
2
}
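What the duplication above does per breakpoint, condensed into a sketch (the wrapper below is hypothetical and omits the incoming-edge, source and destination rewiring handled by the full function; the calls are the same ones used in this file): a NOOP that re-lists the while loop's tensor inputs shadows the breakpoint by flowing into every node the breakpoint flows into, which keeps those inputs live through the condition evaluation.

// Sketch only; not a function in ccv.
static ccv_nnc_graph_exec_symbol_t sketch_shadow_breakpoint(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t breakpoint, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size)
{
	// One NOOP per breakpoint, carrying the while loop's tensor inputs.
	const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
	const ccv_nnc_graph_exec_symbol_info_t* const info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, breakpoint.d);
	int j;
	if (info->outgoings)
		for (j = 0; j < info->outgoings->rnum; j++)
			// The NOOP shadows the breakpoint: it flows into each of the breakpoint's outgoing nodes.
			ccv_nnc_graph_exec_symbol_concat(graph, noop, (ccv_nnc_graph_exec_symbol_t){
				.d = *(int*)ccv_array_get(info->outgoings, j),
				.graph = graph,
			});
	return noop;
}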
3160
3161
// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3162
static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3163
6.22k
{
3164
6.22k
  assert(source_size > 0);
3165
6.22k
  assert(destination_size > 0);
3166
  // First, fill all the "auto" holes.
3167
  // This is the symbol table that with "auto" info filled up.
3168
6.22k
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3169
6.22k
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3170
6.22k
  ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3171
12.4k
  ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3172
0
  ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3173
12.4k
  int i, j, k, p, q;
3174
12.4k
  const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3175
12.4k
  ccv_sparse_matrix_t* exec_dep;
3176
12.4k
  ccv_nnc_tensor_block_t* tensor_blocks;
3177
12.4k
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3178
12.4k
  int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3179
  // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3180
  // are automatically filled in, and all the sub-graphs are processed.
3181
  // There is a last step though, for a while loop, it is parameterized:
3182
  // while (x > 5) {
3183
  //     y = x + 1;
3184
  // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3185
  // we will do our best to avoid doing the actual data copy; what we do here is to check whether y can be x's alias.
3186
  // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
3187
  // it is an in-place operation.
3188
  // But if y cannot be x's alias, for example, this while loop looks like this:
3189
  // while (x > 5) {
3190
  //     y = x + a
3191
  //     b = x + y
3192
  // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3193
  // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3194
  // has dependency on y as well).
3195
  // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3196
  // y = x + a -> b = x + y
3197
  // This graph will be extended to look like this:
3198
  // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3199
  // while (x0 > 5) {
3200
  //     y0 = x0 + a0
3201
  //     b0 = x0 + y0
3202
  //     if (y0 > 5) break
3203
  //     y1 = y0 + b0
3204
  //     b1 = y0 + y1
3205
  // } (y1 => x0, b1 => a0)
3206
  // After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3207
  // with each other now).
3208
  // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
3209
  // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
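A minimal, self-contained sketch of the idea in the comment above, with plain scalars standing in for tensors (this is not library code): after unrolling the body once, the second half writes y1 and b1 straight into the storage that held x0 and a0, so the loop-carried copy disappears and only the pointer/role switch provided by tensor_multiview_t remains at loop exit.

#include <stdio.h>

int main(void)
{
	float x0 = 9, a0 = -6, y0 = x0, b0 = a0; // x0/a0 are the loop-carried regions
	for (;;)
	{
		if (!(x0 > 5))
			break;
		y0 = x0 + a0; // y0 = x0 + a0, in its own region, so x0 stays readable
		b0 = x0 + y0; // b0 = x0 + y0
		if (!(y0 > 5))
			break; // exiting here leaves (y0 => x, b0 => a) to the pointer switch
		x0 = y0 + b0; // y1 reuses x0's region: y1 = y0 + b0
		a0 = y0 + x0; // b1 reuses a0's region: b1 = y0 + y1
	}
	printf("carried values: x=%g a=%g y=%g b=%g\n", x0, a0, y0, b0);
	return 0;
}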
3210
12.4k
  ccv_nnc_symbolic_graph_t* dup_graph = 0;
3211
12.4k
  int* dup_exec_ref = 0;
3212
12.4k
  int* dup_tensor_block_ref = 0;
3213
12.4k
  int unroll_count = 0;
3214
  // In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3215
12.4k
  ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3216
12.4k
  prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3217
12.4k
  prep->flags = 0;
3218
  // Cannot handle dup a node that is a graph as well.
3219
12.4k
  if (p_exec_symbol_info)
3220
49
  {
3221
49
    prep->flags = p_node_info->flags;
3222
49
    if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3223
21
    {
3224
21
      _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3225
21
      _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3226
28
    } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3227
      // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3228
28
    }
3229
49
  }
3230
12.4k
  ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3231
12.4k
  ccv_array_t* anonymous_block_free_list = 0;
3232
12.4k
  const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3233
  // Record whether this tensor is folded in this round.
3234
12.4k
  uint32_t* const tensor_fold = (uint32_t*)ccmalloc(sizeof(uint32_t) * tensor_fold_size);
3235
32.1k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3236
32.1k
    for (p = 0; p < node->graph_ref_size; p++)
3237
49
    {
3238
49
      assert(symbolic_graph->sub_graphs);
3239
49
      ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3240
49
      ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3241
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3242
49
      sub_prep->dup_breakpoints = dup_breakpoints;
3243
49
      sub_prep->p = prep;
3244
49
      sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3245
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3246
49
      const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3247
293
      for (i = 0; i < s_alloc_prep->block_size; i++)
3248
244
      {
3249
244
        const int block_ref = s_alloc_prep->blocks[i].block_ref;
3250
244
        const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3251
244
        if (block_ref < sub_prep->tensor_symbol_info_size)
3252
192
        {
3253
          // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3254
          // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3255
192
          if (s_tensor_blocks[block_ref].bypass_ref)
3256
1
          {
3257
1
            int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3258
1
            while (s_tensor_blocks[bypass_ref].ref)
3259
0
              bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3260
1
            if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3261
1
              s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3262
1
              continue;
3263
1
          }
3264
191
          if (s_tensor_blocks[block_ref].p_refs[0])
3265
91
          {
3266
            /* If it is already properly assigned, next. */
3267
91
            if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3268
91
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3269
91
            {
3270
91
              if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3271
90
                s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3272
1
              else {
3273
1
                assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3274
1
                s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3275
1
              }
3276
91
            }
3277
            /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3278
91
            if (s_tensor_blocks[block_ref].p_refs[1] &&
3279
91
              s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3280
91
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3281
3
            {
3282
3
              assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3283
3
              assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3284
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3285
3
            }
3286
91
          }
3287
191
        } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3288
          /* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3289
           * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3290
           * these anonymous blocks that has dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3291
           * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3292
           * its life-time to the end of the output tensor. */
3293
15
          if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3294
13
            s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3295
33
          for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3296
18
            ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3297
15
        }
3298
244
      }
3299
49
    }
3300
32.1k
    const int init_tensor_block_size = tensor_block_size;
3301
32.1k
    int rw_anonymous_buffer_size_cap = 0;
3302
32.1k
    int ro_anonymous_buffer_size_cap = 0;
3303
32.1k
    if (anonymous_block_free_list)
3304
17
      ccv_array_clear(anonymous_block_free_list);
3305
32.1k
    memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3306
32.1k
    for (p = 0; p < node->graph_ref_size; p++)
3307
49
    {
3308
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3309
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3310
49
      int rw_anonymous_buffer_size = 0;
3311
49
      int ro_anonymous_buffer_size = 0;
3312
229
      for (i = 0; i < s_alloc_prep->buffer_size; i++)
3313
180
        if (s_alloc_prep->buffers[i].p_refs[0])
3314
90
        {
3315
          /* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3316
90
          int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3317
          /* Need to go through refs. Since we reuse the tensor block for this input, it now has to have allocate at least this much space. */
3318
90
          int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3319
90
          assert(p_ref_0_is_in_or_out != 0);
3320
90
          int unref_p_ref_0 = p_ref_0;
3321
92
          while (tensor_blocks[unref_p_ref_0].ref)
3322
2
            unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3323
          /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3324
90
          assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3325
90
          if (s_alloc_prep->buffers[i].p_refs[1])
3326
4
          {
3327
4
            int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3328
4
            const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3329
4
            assert(p_ref_1_is_in_or_out != 0);
3330
4
            int unref_p_ref_1 = p_ref_1;
3331
4
            while (tensor_blocks[unref_p_ref_1].ref)
3332
0
              unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3333
            /* See above comment for the similar p_ref_0 check. */
3334
4
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3335
4
            assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3336
4
            int p_ref_t;
3337
4
            if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3338
3
            {
3339
3
              CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3340
3
              CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3341
3
            }
3342
4
            p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3343
            /* If the dimension matches, can fold. TODO: should the dimensions match perfectly here? */
3344
4
            if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3345
4
            {
3346
4
              const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3347
4
              if (folded)
3348
1
              {
3349
1
                p_ref_0 = p_ref_1;
3350
1
                unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3351
1
                tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3352
1
                for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3353
0
                {
3354
0
                  const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3355
0
                  assert(folded && "the subsequent duplicates can be folded too.");
3356
0
                }
3357
1
              }
3358
4
            }
3359
4
          }
3360
          /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3361
           * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
3362
           * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3363
           * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3364
           * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3365
           * associated with it, then we are good. */
3366
90
          if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3367
90
            (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3368
90
            (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3369
90
            TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3370
86
          {
3371
86
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3372
27
              { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3373
            /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3374
             * is a long argument why that is the case, the digest is, it is much easier to control your output
3375
             * than your input). */
3376
86
            s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3377
86
            s_alloc_prep->buffers[i].p_refs[1] = 0;
3378
            /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3379
86
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3380
86
            tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3381
95
            for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3382
9
              tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3383
9
                tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3384
9
                  tensor_blocks[unref_p_ref_0].size;
3385
86
          } else {
3386
4
            s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3387
4
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3388
0
              ++ro_anonymous_buffer_size;
3389
4
            else
3390
4
              rw_anonymous_buffer_size += unroll_count + 1;
3391
4
          }
3392
90
        } else {
3393
90
          if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3394
63
            ++ro_anonymous_buffer_size;
3395
27
          else
3396
27
            rw_anonymous_buffer_size += unroll_count + 1;
3397
90
        }
3398
49
      if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3399
28
      {
3400
28
        const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3401
        // All read-write buffer (potentially) can be reused between each case..of branch.
3402
28
        rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3403
        // Read-only buffer cannot be reused between each case..of branch.
3404
28
        ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3405
        /* Anonymous block, allocate additional tensor blocks for this. */
3406
        /* This is either because this is an internal tensor (don't have p_ref) */
3407
        /* or it is an anonymous block itself within the sub graphs of this while graph. */
3408
28
        tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3409
28
        memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3410
28
        if (dup_tensor_block_ref)
3411
3
          dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3412
174
        for (i = 0; i < s_alloc_prep->buffer_size; i++)
3413
146
          if (!s_alloc_prep->buffers[i].p_refs[0])
3414
94
          {
3415
94
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3416
63
            {
3417
63
              assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3418
63
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3419
63
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3420
63
              tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3421
63
              tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3422
63
              tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3423
63
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3424
63
              tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3425
63
              ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3426
63
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3427
63
              if (dup_p_refs && dup_p_refs->rnum > 0)
3428
0
              {
3429
0
                for (j = 0; j < dup_p_refs->rnum; j++)
3430
0
                {
3431
0
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3432
0
                  assert(dup_p_ref >= 0);
3433
0
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3434
0
                  assert(tensor_blocks[dup_p_ref].tail);
3435
                  // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3436
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3437
0
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3438
0
                  {
3439
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3440
0
                    assert(p_node_info);
3441
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3442
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3443
0
                    {
3444
0
                      if (!tensor_blocks[tensor_block_size].dup_p_refs)
3445
0
                        tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3446
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3447
0
                    }
3448
0
                  }
3449
0
                  if (!tensor_blocks[tensor_block_size].tail)
3450
0
                    tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3451
0
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3452
0
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3453
0
                }
3454
63
              } else {
3455
63
                tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3456
63
                ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3457
63
              }
3458
132
              for (j = 0; j < source_size; j++)
3459
69
                _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3460
              /* If this is read-only (based on SSA, if first encountered as read), and this is a
3461
               * sub-graph, mark it to the end of the graph. */
3462
63
              if (p_exec_symbol_info)
3463
12
                for (j = 0; j < destination_size; j++)
3464
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3465
              /* If it is read-only, it is self-reflecting. */
3466
69
              for (k = 0; k < unroll_count; k++)
3467
6
              {
3468
12
                for (j = 0; j < destination_size; j++)
3469
6
                  if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3470
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3471
                /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3472
6
                assert(symbolic_graph->p);
3473
6
                dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3474
6
              }
3475
63
              ++tensor_block_size;
3476
63
            } else {
3477
31
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3478
31
              const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3479
31
              const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3480
              // Find suitable tensor block from the free list.
3481
31
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3482
31
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3483
31
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3484
31
              if (new_anonymous_tensor_block)
3485
28
              {
3486
28
                tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3487
28
                tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3488
28
                tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3489
28
                tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3490
28
                ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3491
28
              } else {
3492
3
                tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3493
3
                tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3494
3
              }
3495
31
              if (dup_p_refs && dup_p_refs->rnum > 0)
3496
5
              {
3497
10
                for (j = 0; j < dup_p_refs->rnum; j++)
3498
5
                {
3499
5
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3500
5
                  assert(dup_p_ref >= 0);
3501
5
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3502
                  // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3503
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3504
5
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3505
0
                  {
3506
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3507
0
                    assert(p_node_info);
3508
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3509
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3510
0
                    {
3511
0
                      if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3512
0
                        tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3513
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3514
0
                    }
3515
0
                  }
3516
5
                  assert(tensor_blocks[dup_p_ref].tail);
3517
5
                  if (!tensor_blocks[tensor_block_idx].tail)
3518
5
                    tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3519
10
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3520
5
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3521
                  // We have to add it to the wrap around companion_ref as well.
3522
                  // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3523
                  // be occupied and unlikely to be reused), we cannot really do too much about it because the companion_ref's
3524
                  // definition is too free-form and if we enforce a stronger guarantee on this (such as it must wrap around), this
3525
                  // guarantee may be broken down the line.
3526
5
                  if (tensor_blocks[dup_p_ref].companion_ref)
3527
0
                  {
3528
0
                    const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3529
0
                    for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3530
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3531
0
                    for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3532
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3533
0
                  }
3534
5
                }
3535
26
              } else if (new_anonymous_tensor_block) {
3536
23
                tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3537
23
                ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3538
23
              }
3539
31
              const int prev_tensor_block_idx = tensor_block_idx;
3540
31
              if (new_anonymous_tensor_block)
3541
28
              {
3542
28
                if (!anonymous_block_free_list)
3543
16
                  anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3544
28
                ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3545
28
                ++tensor_block_size;
3546
28
              }
3547
32
              for (k = 0; k < unroll_count; k++)
3548
1
              {
3549
1
                const int tensor_block_idx = new_anonymous_tensor_block ?
3550
1
                  (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3551
1
                  dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3552
1
                TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3553
1
                TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3554
1
                if (new_anonymous_tensor_block)
3555
1
                {
3556
1
                  tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3557
1
                  tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3558
1
                  tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3559
1
                  tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3560
                  /* Attach to duplicated exec for this tensor block. */
3561
1
                  ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3562
1
                } else {
3563
0
                  tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3564
0
                  tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3565
0
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3566
3567
0
                }
3568
1
                if (dup_p_refs && dup_p_refs->rnum > 0)
3569
1
                {
3570
                  /* Not nil, not self-reflecting. */
3571
2
                  for (j = 0; j < dup_p_refs->rnum; j++)
3572
1
                  {
3573
1
                    const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3574
1
                    assert(dup_p_ref >= 0);
3575
1
                    assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3576
                    // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3577
                    // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3578
1
                    if (tensor_symbol_info[dup_p_ref].p_ref)
3579
0
                    {
3580
0
                      const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3581
0
                      assert(p_node_info);
3582
0
                      const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3583
0
                      if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3584
0
                      {
3585
0
                        if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3586
0
                          tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3587
0
                        ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3588
0
                      }
3589
0
                    }
3590
1
                    assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3591
1
                    const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3592
1
                    assert(tensor_blocks[dup_dup_p_ref].tail);
3593
1
                    if (!tensor_blocks[tensor_block_idx].tail)
3594
1
                      tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3595
2
                    for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3596
1
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3597
                    // We have to add it to the warp around companion_ref as well.
3598
1
                    if (tensor_blocks[dup_dup_p_ref].companion_ref)
3599
0
                    {
3600
0
                      const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3601
0
                      for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3602
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3603
0
                      for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3604
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3605
0
                    }
3606
1
                  }
3607
1
                } else if (new_anonymous_tensor_block) {
3608
0
                  tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3609
0
                  ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3610
0
                }
3611
1
                if (new_anonymous_tensor_block)
3612
1
                  ++tensor_block_size;
3613
1
              }
3614
31
            }
3615
94
          }
3616
28
      }
3617
49
    }
3618
32.1k
  } ccv_nnc_graph_visit_endfor
3619
6.22k
  if (anonymous_block_free_list)
3620
16
    ccv_array_free(anonymous_block_free_list);
3621
6.22k
  ccfree(tensor_fold);
3622
  // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3623
  // the allocation dependencies, thus, which tensor is reused to the existing tensor.
3624
6.22k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3625
6.22k
  prep->while_count_tensor = 0;
3626
6.22k
  prep->dup_breakpoints = 0;
3627
6.22k
  prep->p = 0;
3628
6.22k
  prep->symbolic_graph = symbolic_graph;
3629
6.22k
  prep->p_idx = symbolic_graph->p_idx;
3630
6.22k
  prep->exec_idx = symbolic_graph->exec_idx;
3631
6.22k
  prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3632
6.22k
  prep->sub_preps = sub_preps;
3633
6.22k
  prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3634
6.22k
  prep->exec_symbol_info = exec_symbol_info;
3635
6.22k
  prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3636
6.22k
  prep->tensor_symbol_info = tensor_symbol_info;
3637
6.22k
  prep->unroll_count = unroll_count;
3638
6.22k
  prep->dup_tensor_block_ref = dup_tensor_block_ref;
3639
6.22k
  prep->tensor_block_size = tensor_block_size;
3640
6.22k
  prep->tensor_blocks = tensor_blocks;
3641
6.22k
  prep->exec_flags = exec_flags;
3642
6.22k
  prep->visit = visit;
3643
6.22k
  prep->alloc_prep = alloc_prep;
3644
6.22k
  if (dup_graph)
3645
13
    ccv_nnc_symbolic_graph_free(dup_graph);
3646
6.22k
  if (dup_exec_ref)
3647
13
    ccfree(dup_exec_ref);
3648
6.22k
  return prep;
3649
12.4k
}
3650
3651
static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3652
6.22k
{
3653
6.22k
  int i;
3654
6.22k
  _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3655
6.22k
  ccfree(prep->exec_flags);
3656
6.27k
  for (i = 0; i < prep->sub_prep_size; i++)
3657
50
    if (prep->sub_preps[i])
3658
49
      _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3659
6.22k
  if (prep->sub_preps)
3660
29
    ccfree(prep->sub_preps);
3661
6.22k
  ccfree(prep->tensor_symbol_info);
3662
6.22k
  ccfree(prep->exec_symbol_info);
3663
6.22k
  if (prep->dup_tensor_block_ref)
3664
13
    ccfree(prep->dup_tensor_block_ref);
3665
6.22k
  _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3666
6.22k
  ccv_nnc_graph_visit_free(prep->visit);
3667
6.22k
  ccfree(prep);
3668
6.22k
}
3669
3670
static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3671
6.22k
{
3672
6.22k
  int i, j;
3673
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3674
32.1k
    if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3675
21
    {
3676
21
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3677
21
      assert(graph_ref >= 0);
3678
21
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3679
43
      for (i = 0; i < node->p_while.input_size; i++)
3680
22
        if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i]))
3681
20
        {
3682
20
          ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3683
20
          const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i]);
3684
21
          for (j = 0; j < d; j++)
3685
1
            prep = prep->p;
3686
20
          prep->while_count_tensor = 1;
3687
20
        }
3688
21
    }
3689
32.1k
    for (i = 0; i < node->graph_ref_size; i++)
3690
49
    {
3691
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3692
49
      if (graph_ref >= 0)
3693
49
        _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3694
49
    }
3695
32.1k
  } ccv_nnc_graph_visit_endfor
3696
6.22k
}
3697
3698
static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3699
90.7k
{
3700
90.7k
  if (symbol >= 0)
3701
64.2k
    return graph_prep->tensor_arena->vt_tensors[symbol];
3702
26.5k
  if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3703
26.5k
    return 0;
3704
26.5k
  assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol));
3705
20
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3706
20
  int i;
3707
20
  const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol);
3708
21
  for (i = 0; i < d; i++)
3709
1
    prep = prep->p;
3710
20
  assert(prep->while_count_tensor);
3711
20
  return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3712
20
}
3713
3714
static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3715
6.22k
{
3716
6.22k
  int i;
3717
6.22k
  int* const exec_cvt = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
3718
6.22k
  ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3719
6.22k
  graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3720
6.22k
  graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3721
6.22k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3722
58.2k
  for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3723
52.0k
    if (graph_execs[i].graph == graph)
3724
32.1k
      graph_execs[i].d = exec_cvt[graph_execs[i].d];
3725
6.22k
  ccfree(exec_cvt);
3726
6.22k
}
3727
3728
static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3729
6.22k
{
3730
6.22k
  int i, j, k;
3731
6.22k
  ccv_nnc_graph_t* const graph = graph_prep->graph;
3732
6.22k
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3733
6.22k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3734
6.22k
  graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3735
6.22k
  graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3736
6.22k
  graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3737
6.22k
  graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3738
6.22k
  memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3739
6.22k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3740
6.22k
  int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3741
58.2k
  for (i = 0; i < exec_symbol_info_size; i++)
3742
52.0k
  {
3743
52.0k
    max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size);
3744
52.0k
    max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size);
3745
52.0k
    if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3746
22
      max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size);
3747
52.0k
    graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3748
52.0k
    graph_execs[i].graph = 0;
3749
52.0k
  }
3750
6.27k
  for (i = 0; i < graph_prep->sub_prep_size; i++)
3751
50
    max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size);
3752
6.22k
  ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)];
3753
6.22k
  ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)];
3754
6.22k
  ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)];
3755
6.22k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3756
6.22k
  const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3757
  // Create node, this is in topological order.
3758
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3759
32.1k
    if (CCV_NO_GRAPH_EXEC(graph_execs[idx]))
3760
32.1k
    {
3761
122k
      for (i = 0; i < node->input_size; i++)
3762
90.7k
        max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3763
82.3k
      for (i = 0; i < node->output_size; i++)
3764
50.2k
        max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3765
32.1k
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3766
21
      {
3767
21
        const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3768
21
        assert(graph_ref >= 0);
3769
21
        ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3770
21
        ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3771
21
        graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3772
21
        const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3773
21
        ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3774
21
        ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3775
43
        for (i = 0; i < node->p_while.input_size; i++)
3776
22
          max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3777
42
        for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3778
21
          max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3779
21
        ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3780
21
        _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3781
32.0k
      } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3782
24
        for (i = 0; i < node->output_size; i++)
3783
13
          if (max_outputs[i] && max_outputs[i]->alias_ref)
3784
10
            max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3785
11
        graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3786
        // Check whether this is already covered in the inputs, if not, need to be covered in the update.
3787
22
        for (i = 0; i < node->case_of.argument.offset; i++)
3788
11
        {
3789
11
          ccv_nnc_tensor_t* const update = max_inputs[i];
3790
11
          if (!CCV_IS_TENSOR_MULTIVIEW(update)) // No need if it is a naked tensor.
3791
9
            continue;
3792
2
          int flag = 0;
3793
2
          for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3794
0
            flag = (update == max_inputs[j]);
3795
2
          if (!flag)
3796
2
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3797
2
        }
3798
11
        const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3799
11
        ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3800
11
        if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3801
1
        {
3802
          // Add another graph for data transfer.
3803
1
          ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3804
2
          for (i = 0; i < node->output_size; i++)
3805
1
            max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3806
1
          ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size), max_outputs, ccv_min(node->input_size, node->output_size));
3807
1
          ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3808
1
          ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3809
1
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3810
1
          int exec_cvt;
3811
1
          ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3812
1
        }
3813
39
        for (i = 0; i < node->graph_ref_size; i++)
3814
28
        {
3815
28
          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3816
28
          if (graph_ref < 0)
3817
0
            continue;
3818
28
          ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3819
28
          const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3820
28
          ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3821
28
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3822
28
          _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3823
28
        }
3824
32.0k
      } else {
3825
32.0k
        graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3826
32.0k
      }
3827
32.1k
      ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3828
32.1k
    }
3829
32.1k
  } ccv_nnc_graph_visit_endfor
3830
  // Then connect them.
3831
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3832
32.1k
    if (node->outgoings)
3833
53.1k
      for (i = 0; i < node->outgoings->rnum; i++)
3834
27.9k
      {
3835
27.9k
        const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
3836
27.9k
        if (graph_execs[outgoing].graph)
3837
27.6k
          ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3838
27.9k
      }
3839
32.1k
  } ccv_nnc_graph_visit_endfor
3840
6.22k
  int source_exec_created = 0;
3841
6.22k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3842
6.22k
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3843
6.22k
  ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3844
  // After the graph is materialized, we need to handle the case that some of these tensors require to be initialized to zero before use.
3845
97.5k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3846
91.2k
  {
3847
91.2k
    if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags))
3848
127
    {
3849
127
      int ref = i;
3850
127
      while (tensor_symbol_info[ref].alias_ref)
3851
0
        ref = tensor_symbol_info[ref].alias_ref - 1;
3852
127
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
3853
0
        ref = tensor_blocks[ref].ref - 1;
3854
      // This is not computable. It could be that we marked a const tensor as init zero.
3855
127
      if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]))
3856
39
        continue;
3857
      // If this tensor is not used by any exec, we don't need to init at all. Skip.
3858
88
      if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3859
0
        continue;
3860
88
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3861
      // Now, we have the original tensor, we can get the actual tensor, and construct the set command.
3862
88
      ccv_nnc_graph_exec_t set_exec;
3863
88
      if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3864
27
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3865
61
      else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3866
61
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3867
176
      for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3868
88
      {
3869
88
        const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j);
3870
88
        if (outgoing >= exec_symbol_info_size)
3871
0
          continue;
3872
88
        assert(outgoing >= 0);
3873
88
        assert(graph_execs[outgoing].graph);
3874
88
        ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3875
88
      }
3876
88
      int flags = 0;
3877
88
      if (alloc_dep[ref])
3878
50
        for (j = 0; j < alloc_dep[ref]->rnum; j++)
3879
25
        {
3880
25
          const int d = *(int*)ccv_array_get(alloc_dep[ref], j);
3881
          // This is from alloc_dep, it should be computable.
3882
25
          assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
3883
25
          if (tensor_blocks[d].tail)
3884
50
            for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3885
25
            {
3886
25
              const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
3887
25
              if (incoming >= exec_symbol_info_size)
3888
0
                continue;
3889
25
              assert(incoming >= 0);
3890
25
              assert(graph_execs[incoming].graph);
3891
25
              ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3892
25
              flags = 1;
3893
25
            }
3894
25
        }
3895
      // If cannot find a start node for this exec, we need to append it to the no-op of the start.
3896
88
      if (!flags)
3897
63
      {
3898
63
        if (!source_exec_created)
3899
40
        {
3900
40
          graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3901
40
          source_exec_created = 1;
3902
40
        }
3903
63
        ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3904
63
      }
3905
88
    }
3906
91.2k
  }
3907
  // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3908
  // (we need that if it is not associated as inputs / outputs of any execs, this is possible if all execs associate
3909
  // with its alias).
3910
6.22k
  assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size);
3911
97.5k
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3912
91.2k
  {
3913
91.2k
    ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3914
    // If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3915
91.2k
    if (mv && CCV_IS_TENSOR_MULTIVIEW(mv))
3916
53
    {
3917
53
      const ccv_array_t* const head = tensor_blocks[i].head;
3918
53
      if (head && head->rnum > 0)
3919
94
        for (j = 0; j < head->rnum; j++)
3920
47
        {
3921
47
          const int idx = *(int*)ccv_array_get(head, j);
3922
47
          if (idx >= exec_symbol_info_size)
3923
1
            continue;
3924
47
          assert(idx >= 0);
3925
46
          const int d = graph_execs[idx].d;
3926
46
          ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d);
3927
46
          int flag = 0;
3928
46
          if (exec_info->tensor_wraps_ref)
3929
32
          {
3930
32
            ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1);
3931
113
            for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3932
81
              flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3933
32
          }
3934
          // If none is in the flag, it need to be included in the cast.
3935
46
          if (!flag)
3936
19
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3937
46
        }
3938
53
    }
3939
91.2k
  }
3940
  // Create source / destination phony node. This is to facilitate use of compiled graph.
3941
  // Also, this is needed if you have init zero execs.
3942
6.22k
  if (source_exec_created || source_size > 1)
3943
132
  {
3944
132
    if (!source_exec_created)
3945
92
      graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3946
561
    for (i = 0; i < source_size; i++)
3947
429
      ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3948
6.09k
  } else {
3949
6.09k
    assert(!source_exec_created);
3950
6.09k
    assert(source_size == 1);
3951
6.09k
    graph_exec_arena->source = graph_execs[sources[0].d];
3952
6.09k
  }
3953
6.22k
  if (destination_size == 1)
3954
6.14k
    graph_exec_arena->destination = graph_execs[destinations[0].d];
3955
86
  else {
3956
86
    graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3957
1.08k
    for (i = 0; i < destination_size; i++)
3958
995
      ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3959
86
  }
3960
6.22k
  ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3961
6.22k
  ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3962
6.22k
  return graph_exec_arena;
3963
6.22k
}
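The arena code above gives every compiled graph exactly one source and one destination exec, creating a NOOP "phony" node when there are several real sources or destinations (or extra init execs) to hang off it. A minimal sketch of that pattern, using only calls that already appear in this file; the two work nodes are placeholders, not real commands:

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

static ccv_nnc_graph_t* phony_source_destination_sketch(void)
{
	ccv_nnc_graph_t* const graph = ccv_nnc_graph_new();
	/* Two placeholder nodes standing in for real work. */
	const ccv_nnc_graph_exec_t work0 = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
	const ccv_nnc_graph_exec_t work1 = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
	/* One phony source fans out to every real start node ... */
	const ccv_nnc_graph_exec_t source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
	ccv_nnc_graph_exec_concat(graph, source, work0);
	ccv_nnc_graph_exec_concat(graph, source, work1);
	/* ... and one phony destination collects every real end node. */
	const ccv_nnc_graph_exec_t destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
	ccv_nnc_graph_exec_concat(graph, work0, destination);
	ccv_nnc_graph_exec_concat(graph, work1, destination);
	ccv_nnc_graph_set_sources(graph, &source, 1);
	ccv_nnc_graph_set_destinations(graph, &destination, 1);
	return graph;
}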
3964
3965
static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3966
11
{
3967
11
  if (graph_prep->symbolic_graph == pair)
3968
4
    return graph_prep->graph;
3969
7
  int i;
3970
10
  for (i = 0; i < graph_prep->sub_prep_size; i++)
3971
7
    if (graph_prep->sub_preps[i])
3972
7
    {
3973
7
      ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3974
7
      if (graph)
3975
4
        return graph;
3976
7
    }
3977
3
  return 0;
3978
7
}
3979
3980
static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3981
6.17k
{
3982
6.17k
  int i;
3983
6.22k
  for (i = 0; i < graph_prep->sub_prep_size; i++)
3984
43
    if (graph_prep->sub_preps[i])
3985
42
    {
3986
42
      if (graph_prep->sub_preps[i]->symbolic_graph->pair)
3987
4
        graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
3988
42
    }
3989
6.17k
}
3990
3991
static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3992
6.22k
{
3993
6.22k
  assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
3994
6.22k
  int i;
3995
58.2k
  for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
3996
52.0k
  {
3997
52.0k
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags))
3998
12
      continue;
3999
52.0k
    if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4000
15.8k
    {
4001
15.8k
      ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4002
15.8k
        .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4003
15.8k
        .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4004
15.8k
      });
4005
15.8k
      if (pair_exec.d >= 0)
4006
587
        ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4007
15.8k
    }
4008
52.0k
  }
4009
6.27k
  for (i = 0; i < graph_prep->sub_prep_size; i++)
4010
50
    if (graph_prep->sub_preps[i])
4011
49
      _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4012
6.22k
}
4013
4014
static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4015
6.22k
{
4016
6.22k
  int i;
4017
6.22k
  if (graph_prep->dup_breakpoints)
4018
2
  {
4019
    // Strip the const modifier only possible because it is a sub-graph.
4020
2
    ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4021
4
    for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4022
2
      ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i));
4023
2
    ccv_array_free(graph_prep->dup_breakpoints);
4024
2
    graph_prep->dup_breakpoints = 0;
4025
2
    graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4026
    // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
4027
2
    memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4028
    // Since exec_symbol_info changed, create a new visit object.
4029
2
    assert(symbolic_graph->sources);
4030
2
    assert(symbolic_graph->destinations);
4031
2
    ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0);
4032
2
    const int source_size = symbolic_graph->sources->rnum;
4033
2
    ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0);
4034
2
    const int destination_size = symbolic_graph->destinations->rnum;
4035
4
    ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
4036
0
    ccv_nnc_graph_visit_free(graph_prep->visit);
4037
4
    graph_prep->visit = visit;
4038
4
    assert(graph_prep->p);
4039
2
    ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4040
2
  }
4041
32.1k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
4042
32.1k
    for (i = 0; i < node->graph_ref_size; i++)
4043
49
    {
4044
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
4045
49
      if (graph_ref >= 0)
4046
49
        _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4047
49
    }
4048
32.1k
  } ccv_nnc_graph_visit_endfor
4049
6.22k
}
4050
4051
const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4052
4053
void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4054
6.17k
{
4055
6.17k
  assert(graph_ref);
4056
6.17k
  assert(tensor_arena_ref);
4057
6.17k
  assert(graph_exec_arena_ref);
4058
6.17k
  int i;
4059
  // Cannot bind the multi-view.
4060
53.7k
  for (i = 0; i < tensor_bind_size; i++)
4061
47.5k
  {
4062
47.5k
    assert(tensor_binds[i].tensor);
4063
47.5k
    assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor));
4064
47.5k
  }
4065
6.17k
  ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4066
6.17k
  _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4067
6.17k
  ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4068
6.17k
  _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4069
6.17k
  *tensor_arena_ref = tensor_arena;
4070
  // The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
4071
6.17k
  _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4072
  // Now tensor allocation is done, if there are any dup_breakpoints, I need to clean it up.
4073
6.17k
  _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4074
6.17k
  *graph_ref = graph_prep->graph;
4075
6.17k
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4076
6.17k
  _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4077
6.17k
  _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4078
6.17k
  *graph_exec_arena_ref = graph_exec_arena;
4079
6.17k
  _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4080
6.17k
}
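For orientation, a minimal caller-side sketch of the compile entry point covered above, requesting no tensor binds and no outputs, then tearing everything down. ccv_nnc_graph_free is assumed from the public graph API (it is not defined in this file), and actually running the concrete graph is elided:

#include "ccv_nnc.h"

static void compile_sketch(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
{
	ccv_nnc_graph_t* graph = 0;
	ccv_nnc_tensor_arena_t* tensor_arena = 0;
	ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
	/* Materialize the symbolic graph into a concrete graph plus its two arenas. */
	ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, sources, source_size, destinations, destination_size, &graph, &tensor_arena, &graph_exec_arena);
	/* ... run the concrete graph and read results out of the tensor arena ... */
	ccv_nnc_graph_free(graph); /* assumed public API for freeing the concrete graph */
	ccv_nnc_tensor_arena_free(tensor_arena);
	ccv_nnc_graph_exec_arena_free(graph_exec_arena);
}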
4081
4082
static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4083
6.22k
{
4084
  // Buffers are inherited from above, no need to dealloc.
4085
6.22k
  int i;
4086
6.27k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
4087
50
    if (tensor_arena->sub_arenas[i])
4088
49
      _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4089
6.28k
  for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4090
61
  {
4091
61
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i));
4092
61
    assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv));
4093
61
    ccv_nnc_tensor_multiview_free(*mv);
4094
61
  }
4095
6.22k
  ccv_array_free(tensor_arena->tensor_metadata);
4096
6.22k
  ccv_array_free(tensor_arena->m_tensor_idx);
4097
6.22k
  if (tensor_arena->pb_vt_tensors)
4098
73
    ccfree(tensor_arena->pb_vt_tensors);
4099
6.22k
  if (tensor_arena->vt_alias_r_refs_p)
4100
73
    ccfree(tensor_arena->vt_alias_r_refs_p);
4101
6.22k
  if (tensor_arena->vt_sizes)
4102
5
    ccfree(tensor_arena->vt_sizes);
4103
6.22k
  ccfree(tensor_arena);
4104
6.22k
}
4105
4106
void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4107
83.4k
{
4108
83.4k
  assert(tensor_arena->graph_ref == (intptr_t)symbol.graph);
4109
83.4k
  assert(symbol.d < tensor_arena->vt_tensor_size);
4110
83.4k
  assert(symbol.d >= 0);
4111
  // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4112
83.4k
  int i;
4113
83.4k
  if (!tensor_arena->pb_vt_tensors)
4114
73
  {
4115
73
    tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4116
7.69k
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4117
7.62k
      if (tensor_arena->vt_tensors[i])
4118
6.36k
        tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4119
73
  }
4120
83.4k
  if (!tensor_arena->vt_alias_r_refs_p)
4121
73
  {
4122
73
    tensor_arena->vt_alias_r_refs_p = (int*)cccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4123
73
    tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4124
7.69k
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4125
7.62k
      if (tensor_arena->vt_alias_refs[i])
4126
565
      {
4127
565
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4128
565
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
4129
565
        ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4130
565
      }
4131
73
    int refp = 0;
4132
7.62k
    for (i = 1; i < tensor_arena->vt_tensor_size; 
i++7.55k
) // Allocate each with aliases position on vt_alias_r_refs. It points to the end.
4133
7.55k
      if (tensor_arena->vt_alias_r_refs_p[i])
4134
559
        refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4135
6.99k
      else
4136
6.99k
        tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4137
7.13k
    for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4138
7.06k
      tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4139
7.69k
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4140
7.62k
      if (tensor_arena->vt_alias_refs[i])
4141
565
      {
4142
565
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4143
565
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
4144
565
        const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4145
565
        assert(pos >= 0);
4146
565
        tensor_arena->vt_alias_r_refs[pos] = i;
4147
565
      }
4148
73
  }
4149
83.4k
  const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
4150
83.4k
  if (CCV_IS_TENSOR_VIEW(tensor))
4151
0
  {
4152
0
    assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4153
0
    assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&
4154
0
          ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
4155
0
        (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
4156
0
  } else
4157
83.4k
    { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
4158
83.4k
  if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
4159
0
    { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
4160
83.4k
  tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4161
83.4k
  if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4162
10.9k
    for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4163
10.9k
    {
4164
10.9k
      const int d = tensor_arena->vt_alias_r_refs[i];
4165
10.9k
      if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4166
10.9k
        break;
4167
3
      ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4168
3
      d_tensor->info.datatype = tensor->info.datatype;
4169
3
      d_tensor->info.reserved = tensor->info.reserved;
4170
3
      if (CCV_IS_TENSOR_VIEW(d_tensor))
4171
1
        ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4172
2
      else {
4173
2
        d_tensor->data.u8 = tensor->data.u8;
4174
2
        d_tensor->dataof = tensor->dataof;
4175
2
      }
4176
3
    }
4177
83.4k
}
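The on-demand vt_alias_r_refs construction above is a counting-sort style grouping: count the aliases per referenced tensor, prefix-sum the counts so each slot points at the end of its bucket, then fill the buckets backwards. A simplified, self-contained illustration of that idea on a toy array; the names here are hypothetical, not the arena's fields, and the prefix pass is written in the plain textbook form:

#include <stdio.h>

#define N 6

int main(void)
{
	/* 0 means "not an alias", r + 1 means "alias of tensor r". */
	const int alias_refs[N] = { 0, 1, 0, 1, 3, 0 };
	int bucket_end[N] = { 0 };
	int grouped[N];
	int i, refp = 0;
	for (i = 0; i < N; i++)
		if (alias_refs[i])
			++bucket_end[alias_refs[i] - 1]; /* count aliases per referenced tensor */
	for (i = 0; i < N; i++)
		if (bucket_end[i])
			refp = (bucket_end[i] += refp); /* prefix sum: slot points one past its bucket */
		else
			bucket_end[i] = -1; /* nothing aliases this tensor */
	for (i = refp; i < N; i++)
		grouped[i] = -1; /* slots beyond the total alias count stay unused */
	for (i = 0; i < N; i++)
		if (alias_refs[i])
			grouped[--bucket_end[alias_refs[i] - 1]] = i; /* fill backwards; bucket_end ends at the bucket start */
	for (i = 0; i < N; i++)
		printf("%d ", grouped[i]); /* aliases of tensor 0 first, then aliases of tensor 2 */
	printf("\n");
	return 0;
}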
4178
4179
void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4180
14.5k
{
4181
14.5k
  if (!tensor_arena->pb_vt_tensors)
4182
34
    return;
4183
14.4k
  int i;
4184
483k
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4185
469k
    if (tensor_arena->vt_tensors[i])
4186
295k
      tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4187
14.4k
}
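A minimal usage sketch for the bind / clear pair covered above: redirect the arena's tensor for one symbol at externally owned memory, run, then restore the arena-owned pointers. ccv_nnc_tensor_new, CPU_TENSOR_NHWC and ccv_nnc_tensor_free are assumed from the public tensor API, and the 8x8 shape is illustrative; the bound tensor must be at least as large as the symbol it replaces:

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

static void bind_sketch(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
{
	ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 8), 0);
	/* vt_tensors[symbol.d] (and any aliases of it) now point at x's data. */
	ccv_nnc_tensor_bind_symbol(tensor_arena, symbol, x);
	/* ... run the concrete graph ... */
	/* Restore the pointers recorded in pb_vt_tensors once the binding is no longer needed. */
	ccv_nnc_tensor_arena_clear_bindings(tensor_arena);
	ccv_nnc_tensor_free(x);
}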
4188
4189
uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4190
2
{
4191
2
  uint64_t total_size = 0;
4192
2
  int i;
4193
36
  for (i = 0; i < tensor_arena->buffer_size; i++)
4194
34
    total_size += tensor_arena->buffers[i].size;
4195
2
  return total_size;
4196
2
}
4197
4198
static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4199
0
{
4200
0
  int i;
4201
0
  if (mv->it)
4202
0
    mv->it->info = params;
4203
0
  for (i = 0; i < mv->repeat + mv->kind; i++)
4204
0
  {
4205
0
    ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
4206
0
    if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4207
0
      _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4208
0
    else
4209
0
      tensor->info = params;
4210
0
  }
4211
0
}
4212
4213
int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4214
2.20k
{
4215
2.20k
  int i;
4216
2.20k
  assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
4217
2.20k
  if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
4218
5
  {
4219
5
    tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4220
81
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4221
76
      if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4222
50
      {
4223
50
        ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4224
50
        if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4225
0
        {
4226
0
          ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4227
0
          while (CCV_IS_TENSOR_MULTIVIEW(mv))
4228
0
            mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
4229
0
          tensor = (ccv_nnc_tensor_t*)mv;
4230
0
        }
4231
50
        tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4232
50
      }
4233
5
  }
4234
2.20k
  int flag = 0;
4235
22.2k
  for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4236
20.0k
    if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4237
15.6k
    {
4238
15.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4239
15.6k
      ccv_nnc_tensor_param_t params = symbol_info->info;
4240
15.6k
      params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4241
15.6k
      params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4242
15.6k
      flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4243
15.6k
    }
4244
2.20k
  if (flag)
4245
0
    return -1;
4246
22.2k
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4247
20.0k
    if (tensor_arena->vt_tensors[i])
4248
17.6k
    {
4249
17.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4250
17.6k
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4251
17.6k
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4252
0
      {
4253
0
        assert(!tensor_arena->vt_alias_refs[i]);
4254
0
        _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4255
17.6k
      } else if (!tensor_arena->vt_alias_refs[i]) {
4256
15.6k
        ccv_nnc_tensor_param_t params = symbol_info->info;
4257
15.6k
        params.datatype = tensor->info.datatype;
4258
15.6k
        params.reserved = tensor->info.reserved;
4259
15.6k
        tensor->info = params;
4260
15.6k
      } else {
4261
2.00k
        off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4262
2.00k
        ccv_nnc_tensor_param_t params = symbol_info->info;
4263
2.00k
        params.datatype = tensor->info.datatype;
4264
2.00k
        params.reserved = tensor->info.reserved;
4265
2.00k
        tensor->info = params;
4266
2.00k
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4267
2.00k
        ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4268
2.00k
        if (CCV_IS_TENSOR_VIEW(tensor))
4269
0
        {
4270
0
          ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4271
0
          memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4272
0
        }
4273
2.00k
      }
4274
17.6k
    }
4275
  // Should handle sub_tensor_arena, don't do that at the moment.
4276
2.20k
  assert(!graph->sub_graphs);
4277
2.20k
  return 0;
4278
2.20k
}
4279
4280
void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4281
2.20k
{
4282
2.20k
  assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4283
2.20k
  int i;
4284
11.0k
  for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4285
8.82k
  {
4286
8.82k
    const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4287
8.82k
    if (graph_exec.d < 0)
4288
2.41k
      continue;
4289
6.41k
    const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4290
6.41k
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4291
6.41k
    ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4292
6.41k
    if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, replacing the backend and algorithm to the existing one, which hypothetically has been autotuned..
4293
6.41k
    {
4294
6.41k
      new_cmd.backend = existing_cmd.backend;
4295
6.41k
      new_cmd.algorithm = existing_cmd.algorithm;
4296
6.41k
    }
4297
6.41k
    ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4298
6.41k
  }
4299
2.20k
}
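A minimal sketch of the reinit path covered above: after editing tensor shapes on the symbolic graph (staying within the originally allocated sizes), refresh the existing arenas instead of recompiling. The only error handled is the -1 that ccv_nnc_tensor_arena_reinit returns when a tensor would outgrow its buffer:

#include "ccv_nnc.h"

static int reinit_sketch(ccv_nnc_tensor_arena_t* const tensor_arena, ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
{
	if (ccv_nnc_tensor_arena_reinit(tensor_arena, symbolic_graph) < 0)
		return -1; /* shapes grew past the original allocation; a full recompile is required */
	/* Refresh the commands on the existing execs, keeping the autotuned backend/algorithm when the command is unchanged. */
	ccv_nnc_graph_exec_reinit(graph_exec_arena, graph, symbolic_graph);
	return 0;
}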
4300
4301
void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4302
6.38k
{
4303
6.38k
  int i;
4304
22.7k
  for (i = 0; i < tensor_arena->buffer_size; i++)
4305
16.3k
  {
4306
16.3k
    if (!tensor_arena->buffers[i].ptr)
4307
248
      continue;
4308
16.0k
    const int buffer_type = tensor_arena->buffers[i].type;;
4309
16.0k
    const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4310
16.0k
#ifdef HAVE_CUDA
4311
16.0k
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4312
16.0k
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
4313
2.35k
    {
4314
2.35k
      if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4315
266
        tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4316
2.09k
      else
4317
2.09k
        cufree(device_id, tensor_arena->buffers[i].ptr);
4318
13.7k
    } else {
4319
13.7k
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4320
13.7k
      if (tensor_arena->buffers[i].pin_mem)
4321
17
        cuhostfree(tensor_arena->buffers[i].ptr);
4322
13.7k
      else
4323
13.7k
        ccfree(tensor_arena->buffers[i].ptr);
4324
13.7k
    }
4325
#elif defined(HAVE_MPS)
4326
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4327
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
4328
    {
4329
      // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4330
      //  tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4331
      // else
4332
      mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4333
    } else {
4334
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4335
      ccfree(tensor_arena->buffers[i].ptr);
4336
    }
4337
#else
4338
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4339
    ccfree(tensor_arena->buffers[i].ptr);
4340
#endif
4341
16.0k
    tensor_arena->buffers[i].ptr = 0;
4342
16.0k
  }
4343
  // For now, the life-cycle of the disposers lives with the buffer. It may ends before the tensor arena deallocates.
4344
6.38k
  if (tensor_arena->disposers)
4345
0
  {
4346
0
    for (i = 0; i < tensor_arena->disposers->rnum; i++)
4347
0
    {
4348
0
      ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i);
4349
0
      disposer->dispose(disposer->ptr, disposer->userdata);
4350
0
    }
4351
0
    ccv_array_free(tensor_arena->disposers);
4352
0
    tensor_arena->disposers = 0;
4353
0
  }
4354
6.38k
}
4355
4356
void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4357
6.17k
{
4358
6.17k
  ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4359
6.17k
  _ccv_nnc_tensor_arena_free(tensor_arena);
4360
6.17k
}
4361
4362
void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4363
6.22k
{
4364
6.22k
  int i;
4365
6.27k
  for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4366
50
    if (graph_exec_arena->sub_arenas[i])
4367
49
      ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4368
6.22k
  ccfree(graph_exec_arena);
4369
6.22k
}