Coverage Report

Created: 2022-07-27 23:53

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_symbolic_graph_compile.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#ifdef HAVE_CUDA
#include "gpu/ccv_nnc_compat.h"
#endif
#include "_ccv_nnc_graph.h"
#include "_ccv_nnc_symbolic_graph.h"

// MARK - Level-3 API

typedef struct {
  int flags;
  int type;
  int pin_mem; // This memory needs to be pinned.
  int ref; // Reference to another tensor block. Starts at 1.
  int bypass_ref; // Copy over the bypass_ref from the tensor symbol underneath. Starts at 1.
  int companion_ref; // Reference to another block with which this one shares the same memory region. Starts at 1. The current crude implementation requires the two to be mutually companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. The allocation algorithm uses the primary throughout.
  int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Starts at 1.
  ccv_array_t* r_refs; // If this is referenced by another block, the array points back to those blocks. Starts at 1.
  uint64_t size; // The expected size of the tensor.
  int p_refs[2]; // References to the parent tensor block; at most there will be two. Starts at 1.
  ccv_array_t* dup_p_refs; // References to the parent tensor block from the duplicated tensor blocks. There could be many. Starts at 0.
  ccv_array_t* head; // The head nodes (there can be multiple if, from the graph, one cannot determine which is the first).
  ccv_array_t* tail; // The tail nodes (there can be multiple if, from the graph, one cannot determine which is the last).
} ccv_nnc_tensor_block_t; // Tensor Arena Block

#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))

enum {
  UNASSIGNED = 0x1,
  ALIAS = 0x2,
  READ_ONLY = 0x4,
  WRITE_ONLY = 0x8,
  READ_WRITE = 0xc,
  ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not referencing any specific tensor).
  UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
  UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
};

#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)

#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
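
// A worked example of the flag encoding above (illustrative only): the low two bits carry the
// UNASSIGNED/ALIAS state (0x0 means "ordinary"), bits 0x4/0x8 carry the read/write intent
// (READ_WRITE is simply READ_ONLY | WRITE_ONLY), and 0x10/0x20/0x40 are independent markers.
// Starting from flags = UNASSIGNED (0x1), TENSOR_SET_READ_WRITE(t, READ_WRITE) yields 0xd;
// TENSOR_EXPECT_UNASSIGNED(t) is still true because 0xd & 0x3 == 0x1; after
// TENSOR_EXPECT_UNSET_UNASSIGNED(t) the flags become 0xc, so TENSOR_EXPECT_ORDINARY(t) holds
// and TENSOR_READ_WRITE(t) == READ_WRITE.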

// Holds additional information about the exec nodes.
typedef struct {
  int flags;
} ccv_nnc_graph_exec_flag_t;

enum {
  CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
};

typedef struct {
  int index;
  int oc;
  int type;
  uint64_t size;
} ccv_nnc_tensor_opt_t;

// We first sort the same type together (because they won't be reused at all),
// then we sort by size, and after that, sort by oc.
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
#undef more_than
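
// Ordering example (a sketch of the intent, not a statement about CCV_IMPLEMENT_QSORT internals):
// with the comparator above, a candidate is ranked ahead of another when it has a larger size, or
// the same size and a larger (or equal) oc. So for candidates {size 128, oc 1}, {size 64, oc 5}
// and {size 64, oc 2}, the intended order after sorting is the 128-byte one first, then the
// 64-byte one with oc 5, then the 64-byte one with oc 2.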

// If b has items overlapping with a, a is still after b (inclusive).
static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
{
  assert(a);
  assert(b);
  int x, y;
  for (x = 0; x < b->rnum; x++)
  {
    const int p = *(int*)ccv_array_get(b, x);
    int flag = 0;
    // In extreme cases where a is a superset of b, then a is still after b, we are good.
    for (y = 0; !flag && y < a->rnum; y++)
    {
      const int q = *(int*)ccv_array_get(a, y);
      flag = (p == q);
    }
    if (!flag)
      for (y = 0; y < a->rnum; y++)
      {
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
        if (!cell.i32 || cell.i32[0] == 0)
          return 0;
      }
  }
  // If b->rnum == 0, a is after b for sure.
  // Otherwise, if a->rnum == 0, we don't check anything, but if b->rnum > 0, then we cannot say a is after b.
  // If both a->rnum > 0 and b->rnum > 0, the logic above has checked everything.
  return (a->rnum > 0 || b->rnum == 0);
}

static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
{
  assert(a);
  assert(b);
  int x, y, max_hop = 0;
  for (x = 0; x < a->rnum; x++)
    for (y = 0; y < b->rnum; y++)
    {
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
      if (!cell.i32 || cell.i32[0] == 0)
        return 0;
      max_hop = ccv_max(cell.i32[0], max_hop);
    }
  // We've made it through this nested for loop; therefore, a must be verifiably, deterministically after b now.
  // The max hop also denotes, if that is the case, how many hops, maximally speaking, we need to get from a to b.
  return max_hop;
}

// If every a's head is deterministically after b's tail
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
{
  return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
}
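
// How the helpers above read exec_dep (as far as the usage below goes): cell (i, j) of the sparse
// matrix, when present and positive, records that exec node i runs deterministically after exec
// node j, with the value being the hop count between them. For a hypothetical pair of blocks where
// a.head = {5}, b.tail = {2} and the cell at (5, 2) holds 3, _ccv_nnc_tensor_block_head_after_tail
// returns 3; if the cell is absent (or zero), it returns 0 and the caller treats the two blocks as
// interfering.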

typedef struct {
  ccv_array_t** alloc_dep;
  int vt_block_size;
  int buffer_size;
  int block_size;
  int* vt_blocks; // A reference to the block, because blocks only contains available blocks (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
  struct {
    int type; // The type from tensor blocks.
    int pin_mem; // Whether this is pinned memory.
    int flags; // The flags (currently for READ_ONLY or not).
    uint64_t size; // The size of the buffer allocated.
    int p_refs[2]; // Reference to the upper level block. Starts at 1. Only index 0 is valid throughout; I do use two in the code as a temporary placeholder.
    ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From a buffer, it can point to multiple because it can be associated with multiple tensor blocks that point to different outputs (for example, in the 1st unroll, pointing to one block while in the 2nd unroll, pointing to another). Starts at 0.
  }* buffers;
  struct {
    int buffer_ref; // A reference for the block to which buffer to use. Starts at 0.
    int block_ref; // A reference to which block in the given tensor_block to use.
    uint64_t offset; // The offset of this block.
  }* blocks;
} ccv_nnc_tensor_alloc_prep_t;

typedef struct ccv_nnc_symbolic_graph_prep_s {
  int flags;
  int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
  int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
  int exec_idx;
  int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
  int tensor_symbol_info_size;
  int exec_symbol_info_size;
  int tensor_block_size;
  int sub_prep_size;
  ccv_nnc_tensor_block_t* tensor_blocks;
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
  ccv_nnc_graph_exec_flag_t* exec_flags;
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
  int* dup_tensor_block_ref;
  ccv_nnc_graph_visit_t* visit;
  ccv_nnc_tensor_alloc_prep_t* alloc_prep;
  struct ccv_nnc_symbolic_graph_prep_s* p;
  struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
  // Structures that don't require to be freed after deallocation.
  const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
  ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
  ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
  ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs' life-cycle for the while expr.
} ccv_nnc_symbolic_graph_prep_t;

typedef struct {
  int oc;
  ccv_array_t* itf;
} ccv_nnc_tensor_block_adjacent_t;
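
// A rough outline of the allocation scheme implemented by the function below, paraphrasing its own
// comments: (1) build pairwise interference between computable tensor blocks from exec_dep (blocks
// whose lifetimes cannot be ordered interfere and bump each other's oc); (2) repeatedly pick the
// largest not-yet-assigned block (ties broken by oc), and try to slot it onto an existing edge of
// an allocation flow graph whose virtual source is row 0 and whose sink is row
// tensor_block_size + 1, preferring the tightest hop bound; (3) if no edge can host it, open a new
// buffer (assignment group) sized to that block. The resulting buffers and per-block offsets are
// what ccv_nnc_tensor_alloc_prep_t records.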

static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
{
  // Compute how many dis-continuous buffers are needed.
  // We prefer to have several dis-continuous buffers instead of one big buffer because
  // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
  // to fully utilize memory.
  int i, j, k;
  ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
  int allocable_tensor_size = 0, available_tensor_size = 0;
  for (i = 0; i < tensor_block_size; i++)
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
    {
      // Tensors that we need the header info for.
      ++available_tensor_size;
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
        // Tensors that we actually need to allocate (excluding the aliases).
        ++allocable_tensor_size;
    }
  ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
  ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
  ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
  // Overlap count.
  for (i = 0; i < tensor_block_size; i++)
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
      for (j = i + 1; j < tensor_block_size; j++)
        if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
        {
          // Check to see if they interfere (default to yes).
          // If any of the i's head is deterministically later than j's tail
          // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
          const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
          if (i_hop_j > 0)
          {
            ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
            ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
          }
          const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
          if (j_hop_i > 0)
          {
            ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
            ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
          }
          // It cannot be that both i can hop to j and j can hop to i.
          assert(!(i_hop_j > 0 && j_hop_i > 0));
          if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
          {
            if (!adj[i].itf)
              adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
            ccv_array_push(adj[i].itf, &j);
            ++adj[i].oc;
            if (!adj[j].itf)
              adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
            ccv_array_push(adj[j].itf, &i);
            ++adj[j].oc;
          }
        }
  int* const buf = (int*)ccmalloc(sizeof(int) * tensor_block_size);
  int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
  uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
  uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
  int num_assigned = 0;
  // I can do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
  // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
  // The first channel denotes the bytes available for allocation,
  // the second channel denotes the offset available for the allocation.
  ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
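  // Each cell (y, x) of this allocation graph is a 2-channel uint64: channel 0 is the number of
  // bytes still available along that edge, channel 1 is the byte offset at which they start.
  // For instance (hypothetical numbers), if an edge currently carries {1024, 512} and a block of
  // size 256 is inserted on it below, the edge is updated to {768, 768}: 256 bytes are consumed
  // and the offset advances past them ("Move the offset to the next one").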
  ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
  for (j = 0; j < allocable_tensor_size;)
  {
    // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
    uint64_t max_size = 0;
    ccv_array_clear(opt);
    int current_type = 0; // Deal with one type at a time.
    for (i = 0; i < tensor_block_size; i++)
      if (tensor_blocks[i].size >= max_size &&
        TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
        IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
        (!current_type || tensor_blocks[i].type == current_type))
      {
        ccv_nnc_tensor_opt_t a = {
          .size = tensor_blocks[i].size,
          .index = i,
          .oc = adj[i].oc,
          .type = tensor_blocks[i].type,
        };
        assert(a.type);
        current_type = a.type; // Now we know the primary type we should deal with.
        if (tensor_blocks[i].companion_ref)
        {
          const int companion_ref = tensor_blocks[i].companion_ref - 1;
          a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
          a.oc += adj[companion_ref].oc;
        }
        // In case we have a tie, take them all in the array.
        if (a.size > max_size)
          ccv_array_clear(opt), max_size = a.size;
        ccv_array_push(opt, &a);
      }
    assert(opt->rnum > 0);
    // Order the opt array by the oc because type and size should be equal at this point.
    _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
    // Go through the opt array again. This time, it is ordered by size; therefore, if we found a place to insert, we are good.
    int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep->rows * 3;
    uint64_t min_val[2] = {
      0, 0
    };
    for (i = 0; i < opt->rnum; i++)
    {
      ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
      // Now, determine the order between a and c. After this, we can always check whether y
      // can hop to the earliest one and if the latest one can hop to x.
      // The earliest one will be called p and the latest one will be called q.
      int p = a.index;
      int q = a.index;
      if (tensor_blocks[a.index].companion_ref)
      {
        const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
        const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
        if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
          p = companion_ref;
        else {
          const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
          if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
            q = companion_ref;
          else { // Otherwise, b is in between p and q.
            const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
            const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
            assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
          }
        }
      }
      assert(tensor_blocks[q].type == tensor_blocks[p].type);
      const int type = tensor_blocks[p].type;
      // y is always earlier than x, but this is hard to assert now.
      // If this edge satisfies the requirement, we now need to find the ones with the tightest possible bounds.
      // Thus, the hop between y and x (through a) should be the smallest one.
      // We optimized this by first finding all allocated nodes that come into p, and all allocated nodes that
      // go out of q. For these nodes, we try to verify whether they form a connection (by checking against
      // the alloc sparse matrix). If they do, try to see whether we can insert with the tightest bound.
      int y_size = 0;
      int* const y_buf = buf;
#define for_block(y, val) do { \
        if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type) \
          y_buf[y_size++] = y + 1; \
      } while(0)
      ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
      if (y_vector)
        CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
#undef for_block
      assert(y_size <= tensor_block_size);
      int x_size = 0;
      int* const x_buf = buf + y_size;
#define for_block(x, val) do { \
        if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type) \
          x_buf[x_size++] = x + 1; \
      } while(0)
      ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
      if (x_vector)
        CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
#undef for_block
      assert(y_size + x_size <= tensor_block_size);
      int x, y;
      for (y = 0; y < y_size; y++)
      {
        const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y], tensor_block_size + 1);
        if (val.u64 && val.u64[0] >= a.size)
        {
          const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
          assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
          const int hop = exec_dep->rows + y_hop_p.i32[0];
          if (hop < min_hop)
            min_y = y_buf[y], min_x = tensor_block_size + 1, min_hop = hop,
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
        }
      }
      for (x = 0; x < x_size; x++)
      {
        const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x]);
        if (val.u64 && val.u64[0] >= a.size)
        {
          const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
          assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
          const int hop = exec_dep->rows + q_hop_x.i32[0];
          if (hop < min_hop)
            min_y = 0, min_x = x_buf[x], min_hop = hop,
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
        }
      }
      for (y = 0; y < y_size; y++)
      {
        ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y]);
        if (y_vector)
          for (x = 0; x < x_size; x++)
          {
            const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x]);
            if (val.u64 && val.u64[0] >= a.size)
            {
              const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
              const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
              assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
              assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
              const int hop = y_hop_p.i32[0] + q_hop_x.i32[0];
              if (hop < min_hop)
                min_y = y_buf[y], min_x = x_buf[x], min_hop = hop,
                  min_val[0] = val.u64[0], min_val[1] = val.u64[1];
            }
          }
      }
      // If I found a place, stop, and exit.
      if (min_y > 0 || min_x < tensor_block_size + 1)
      {
        min_i = i;
        break;
      }
    }
    // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group),
    // and default to the largest size available.
    ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
    if (min_i == -1)
    {
      allocated_size[num_assigned] = a.size;
      ++num_assigned;
    }
    int assign_group = num_assigned;
    if (min_y > 0)
    {
      assign_group = assigned[min_y - 1];
      // The y and x should belong to the same assigned group.
      assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
    } else if (min_x < tensor_block_size + 1)
      assign_group = assigned[min_x - 1];
    // If min_y is the source and min_x is the destination, we don't need to do anything; otherwise, decrease the weight on that edge.
    if (min_y != 0 || min_x != tensor_block_size + 1)
    {
      uint64_t val[2] = {
        min_val[0], min_val[1]
      };
      assert(val[0] >= a.size);
      val[0] -= a.size;
      val[1] = val[1] + a.size; // Move the offset to the next one.
      ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
    }
    int strings[3];
    strings[0] = a.index + 1;
    int string_size = 1;
    // Assign out the designated companion if it exists.
    if (tensor_blocks[a.index].companion_ref)
    {
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
      assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
      const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
      if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
      {
        for (i = 0; i < string_size; i++)
          strings[i + 1] = strings[i];
        strings[0] = companion_ref + 1;
      } else {
        const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
        if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
          strings[string_size] = companion_ref + 1;
        else {
          // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
          assert(string_size == 2);
          strings[2] = strings[1];
          strings[1] = companion_ref + 1;
        }
      }
      ++string_size;
    }
    // Assign out and update oc.
    for (i = 0; i < string_size; i++)
    {
      const int index = strings[i] - 1;
      // Assign out the selected one.
      assigned[index] = assign_group;
      // The offset for this one should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
      allocated_offset[index] = min_val[1];
      if (adj[index].itf)
        for (k = 0; k < adj[index].itf->rnum; k++)
        {
          const int d = *(int*)ccv_array_get(adj[index].itf, k);
          if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
            --adj[d].oc;
        }
    }
    uint64_t val[2] = {
      a.size, min_val[1]
    };
    uint64_t consumed_size = 0;
    // Go over from min_y to string_size (excluding min_x).
    for (i = 0; i < string_size; i++)
    {
      const uint64_t size = tensor_blocks[strings[i] - 1].size;
      assert(size <= a.size);
      // Update consumed size if it is bigger than "size".
      if (size > consumed_size)
      {
        val[0] = size - consumed_size;
        ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
        consumed_size = size;
        val[1] = min_val[1] + consumed_size;
      }
      // If it consumed all the flow, break out.
      if (consumed_size == a.size)
        break;
    }
    for (i = 0; i < string_size; i++)
    {
      const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
      uint64_t val[2] = {
        i_size, min_val[1]
      };
      uint64_t consumed_size = 0;
      for (k = i + 1; k < string_size; k++)
      {
        const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
        // Update consumed size if it is bigger than "size".
        if (size > consumed_size)
        {
          val[0] = size - consumed_size;
          ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
          consumed_size = size;
          val[1] = min_val[1] + consumed_size;
        }
        // If it consumed all the flow, break out.
        if (consumed_size == i_size)
          break;
      }
      val[0] = i_size - consumed_size;
      // Still have residual, flow it to min_x.
      if (val[0] > 0)
        ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
    }
    j += string_size;
  }
  ccfree(buf);
  ccv_array_free(opt);
  ccv_matrix_free(tensor_df);
  ccv_matrix_free(tensor_dt);
#define for_block(y, x, val) do { \
    if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
    { \
      if (!alloc_dep[x - 1]) \
        alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
      ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
    } \
  } while (0)
  CCV_SPARSE_FOREACH(alloc, for_block);
#undef for_block
  ccv_matrix_free(alloc);
  for (i = 0; i < tensor_block_size; i++)
    if (adj[i].itf)
      ccv_array_free(adj[i].itf);
  ccfree(adj);
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
  alloc_prep->alloc_dep = alloc_dep;
  alloc_prep->vt_block_size = tensor_block_size;
  alloc_prep->buffer_size = num_assigned;
  alloc_prep->block_size = available_tensor_size;
  alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
  alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
  alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
  memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
  for (i = 0; i < num_assigned; i++)
    alloc_prep->buffers[i].size = allocated_size[i];
  ccfree(allocated_size);
  j = 0;
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
  for (i = 0; i < tensor_block_size; i++)
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
    {
      alloc_prep->blocks[j].block_ref = i;
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
      {
        alloc_prep->vt_blocks[i] = j;
        // Also, set its allocations.
        assert(assigned[i] > 0);
        const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
        alloc_prep->blocks[j].offset = allocated_offset[i];
        if (!alloc_prep->buffers[buffer_ref].type)
          alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
        alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
        alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
        assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
      } else {
        alloc_prep->vt_blocks[i] = -1;
        alloc_prep->blocks[j].buffer_ref = -1;
        alloc_prep->blocks[j].offset = 0;
      }
      ++j;
    } else
      alloc_prep->vt_blocks[i] = -1;
  ccfree(allocated_offset);
  ccfree(assigned);
  return alloc_prep;
}
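
// A minimal sketch of how a caller might walk the returned plan (illustrative only; this helper is
// hypothetical, not called anywhere, and assumes <stdio.h> is pulled in by the headers above): one
// entry per buffer with its total size, and one entry per surviving block with the buffer it lives
// in and its byte offset. Alias blocks carry buffer_ref == -1.
static inline void _example_print_alloc_prep(const ccv_nnc_tensor_alloc_prep_t* const alloc_prep)
{
  int i;
  for (i = 0; i < alloc_prep->buffer_size; i++)
    printf("buffer %d: %llu bytes\n", i, (unsigned long long)alloc_prep->buffers[i].size);
  for (i = 0; i < alloc_prep->block_size; i++)
    if (alloc_prep->blocks[i].buffer_ref >= 0) // Skip aliases, which have no buffer of their own.
      printf("block %d -> buffer %d at offset %llu\n", alloc_prep->blocks[i].block_ref, alloc_prep->blocks[i].buffer_ref, (unsigned long long)alloc_prep->blocks[i].offset);
}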

static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
{
  int i;
  for (i = 0; i < alloc_prep->vt_block_size; i++)
    if (alloc_prep->alloc_dep[i])
      ccv_array_free(alloc_prep->alloc_dep[i]);
  for (i = 0; i < alloc_prep->buffer_size; i++)
    if (alloc_prep->buffers[i].dup_p_refs)
      ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
  ccfree(alloc_prep->alloc_dep);
  ccfree(alloc_prep);
}

// Simple allocator from ccv_array_t.
static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
{
  int pos = tensor_metadata->rnum;
  int rsize = (size + 15) / 16;
  ccv_array_resize(tensor_metadata, pos + rsize);
  return (pos << 1) + 1;
}

static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
{
  assert((pos >> 1) < tensor_metadata->rnum);
  return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
}

#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
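
// A minimal usage sketch of the odd-tagged handle scheme above (illustrative only; this helper is
// hypothetical and not part of the real flow). If rnum is 3 when the entry is carved out, the
// returned handle is (3 << 1) + 1 = 7: the low bit marks it as a position rather than a raw
// pointer, and pos >> 1 recovers the element index inside the 16-byte-slot pool.
static inline ccv_nnc_tensor_t* _example_metadata_roundtrip(ccv_array_t* const tensor_metadata)
{
  const int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
  assert(CCV_NNC_IS_METADATA_POS(pos)); // Every handle is odd by construction.
  return _ccv_nnc_tensor_metadata_get(tensor_metadata, pos); // Valid until the array is resized again.
}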

static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
{
  // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
  if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
    return vt_tensor;
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
  if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref))
  {
    const int alias_ref = tensor->alias_ref;
    tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
    _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
  }
  if (CCV_IS_TENSOR_MULTIVIEW(tensor))
  {
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
    int i;
    const int count = mv->kind + mv->repeat;
    for (i = 0; i < count; i++)
    {
      if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
      {
        const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
      }
    }
    // No need to recursively do parent pointer, otherwise we are in deep rewire.
    if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p))
      mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
    if (mv->sp)
      for (i = 0; i < mv->sp->rnum; i++)
      {
        ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
        if (CCV_NNC_IS_METADATA_POS(*tensor))
        {
          const int pos = (int)(intptr_t)*tensor;
          *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
          assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
          _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
        }
      }
  }
  return tensor;
}
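
// Note on the position-then-rewire pattern above: while the arena is being built, entries refer to
// each other through odd-tagged positions instead of raw pointers, presumably because
// ccv_array_resize can reallocate the backing storage and would invalidate any pointer taken
// earlier. Once the pool stops growing, _ccv_nnc_tensor_metadata_rewire walks a tensor (and, for
// multiviews, its data[], p and sp entries) and swaps every stored position for the real pointer
// it denotes.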

typedef struct {
  const uint8_t* ptr;
  int pos;
} ccv_nnc_tensor_block_pos_t;

static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
{
  int i;
  int unref_block_ref = block_ref;
  while (prep->tensor_blocks[unref_block_ref].ref)
    unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
  int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
  assert(vt_ref >= 0);
  assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
  const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
  uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
  int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
  for (i = idx - 1; i >= 0; i--)
  {
    assert(p_ref >= 0);
    const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
    const int unroll_count = graph_prep->unroll_count;
    if (ch[i]) // Prefer the dup side of things.
      p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
    int unref_p_ref = p_ref;
    while (graph_prep->tensor_blocks[unref_p_ref].ref)
      unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
    vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
    const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
    offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
    // If the buffer already exists, prefer that.
    const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
    if (ptr)
    {
      // If I have any remaining path that is not covered from 0, I cannot possibly
      // have any pointer from buffer (that can only happen if it is not dup).
      for (--i; i >= 0; i--)
        if (ch[i] != 0)
          return 0;
      // Try to find the created tensor block pos in the array, just linear scan.
      const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
      ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
      *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr + offset, params, 0);
      return tv_pos;
    }
    p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
  }
  return 0;
}

// Descend from the root to the prep level, and compose a multiview from there.
static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
{
  assert(pos_ref);
  int i;
  const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
  const int unroll_count = prep->unroll_count;
  if (prep == graph_prep)
  {
    const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
    if (!data_pos)
      return -1;
    // Based on ch, go all the way back to find the exact pointer to compose.
    if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receivers. Just one tensor to receive the update is enough.
      prep->dup_tensor_block_ref &&
      prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
      prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
    {
      int pos[unroll_count + 1];
      pos[0] = data_pos;
      for (i = 0; i < unroll_count; i++)
        pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
      ccv_nnc_tensor_t* data[unroll_count + 1];
      for (i = 0; i < unroll_count + 1; i++)
        data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
      ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
      for (i = 0; i < unroll_count + 1; i++)
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
      *pos_ref = mv_pos;
    } else {
      *pos_ref = data_pos;
    }
    if (preserve)
    {
      // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv.
      // At any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
      // a mv of K11; thus, when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
      // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
      // arena is allocated).
      // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
      // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
      // it into a K01 structure.
      // Why didn't we wrap it directly as mv->data[0] pointing to an assigned tv pointer and mv->data[1] pointing
      // to the raw pointer (as ptr_ref) with K11? The reason is that we don't know whether the assigned tv points to one
      // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
      int prev_mv_pos = *pos_ref;
      if (prev_mv_pos == -1)
      {
        prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
        ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
          tv,
        }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
        CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
      }
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
      ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
        CCV_NNC_TENSOR_PLACEHOLDER,
        (ccv_nnc_tensor_t*)prev_mv,
      }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
      prev_mv->p = (void*)(intptr_t)mv_pos;
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
      *pos_ref = mv_pos;
    }
    return 0;
  }
  ch[idx] = 0;
  int pos[unroll_count + 1];
  pos[0] = 0;
  const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
  assert(retval == 0);
  for (i = 0; i < unroll_count; i++)
  {
    ch[idx] = i + 1;
    pos[i + 1] = 0;
    const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
    if (dup_retval < 0)
    {
      assert(i == 0);
      break;
    }
  }
  // If the current prep has no dup.
  if (i == 0)
  {
    *pos_ref = pos[0];
    return 0;
  }
  ccv_nnc_tensor_t* data[unroll_count + 1];
  // Compose to a new multiview.
  for (i = 0; i < unroll_count + 1; i++)
    { assert(pos[i] > 0); }
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
  for (i = 0; i < unroll_count + 1; i++)
    data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
  ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
  for (i = 0; i < unroll_count + 1; i++)
    if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
      ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
  for (i = 0; i < unroll_count + 1; i++)
    CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
  *pos_ref = mv_pos;
  return 0;
}

static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
{
  int i;
  int is_input = 0;
  assert(node);
  for (i = 0; i < node->input_size && !is_input; i++)
    if (p_ref == node->inputs[i])
      is_input = 1;
  int is_output = 0;
  for (i = 0; i < node->output_size && !is_output; i++)
    if (p_ref == node->outputs[i])
      is_output = 1;
  // Prefer it being an output if it is both the input and the output.
  if (is_output)
    return 1;
  if (is_input)
    return -1;
  return 0;
}
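
// Return convention used by the callers below: 1 means p_ref appears among the node's outputs
// (preferred when it is both an input and an output), -1 means it appears only among the inputs,
// and 0 means it appears in neither list.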

static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
{
  // No need to check whether to preserve if this is not a while loop.
  if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
    return 0;
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
  // If it is unassigned, no need to preserve.
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
    return 0;
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
  // If p is not an input, no need to preserve at all.
  if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
    return 0;
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
  assert(vt_ref >= 0);
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
  // If the buffer is a truly read-only one, no need to preserve.
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
    return 0;
  /* This needs detailed explanation: what does preserve mean?
   * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
   * also used outside of the while loop, we cannot reuse the memory region of x for
   * the loop, otherwise we will destroy x when doing the y = x + 1 computation (assuming
   * y uses the same memory region as x). The way to work around this is by using a different
   * memory region for y = x + 1, but for the first iteration, having x point to the
   * original. During the allocation process, the way to identify whether x should preserve
   * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
   * parent tensor is the same as the memory region it plans to use in the buffer, then we are
   * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
   * it is the input tensor whenever that is possible. A tensor block can point to two parent
   * tensors, one is the input tensor, one is the output tensor. p_refs[0] should be the input
   * tensor whenever that is possible. */
  if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
    return 0;
  // Otherwise, return 1 because we now need to preserve.
  return 1;
}

static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
{
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
  // If it is unassigned, no need to preserve.
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
    return 0;
  // Only tape vars need to force broadcast; otherwise we already share the same memory region.
  if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
    return 0;
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
  // If p is not an output, no need to broadcast at all.
  if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
    return 0;
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
  assert(vt_ref >= 0);
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
  // If the buffer is a truly read-only one, no need to broadcast.
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
    return 0;
  // Otherwise, return 1 because we now need to force broadcast for this tape var.
  return 1;
}

static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
{
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
  int i;
  for (i = 0; i < mv->kind + mv->repeat; i++)
    if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
    else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
      _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
}

static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
{
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
  int i;
  if (mv->sp)
    for (i = 0; i < mv->sp->rnum; i++)
    {
      ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
      if (CCV_NNC_IS_METADATA_POS(*tensor))
      {
        const int pos = (int)(intptr_t)*tensor;
        *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
        assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
      }
    }
  for (i = 0; i < mv->kind + mv->repeat; i++)
  {
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
      CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
      _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
  }
}

static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
{
  // Go to the root of the graph.
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
  int i;
  for (i = 1; prep->p; i++)
    prep = prep->p;
  // The root graph should have no dup tensor blocks.
  assert(!prep->dup_tensor_block_ref);
  const int c = i;
  const ccv_nnc_symbolic_graph_prep_t* preps[c];
  prep = graph_prep;
  preps[c - 1] = prep;
  for (i = 0; prep->p; i++)
    preps[c - 2 - i] = prep = prep->p;
  int ch[c]; // Use dynamic allocation for the array. This is an array to record our selections when recursing from top to bottom.
  memset(ch, 0, sizeof(int) * c);
  int pos = 0;
  _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
  assert(ch[c - 1] == 0); // This should never be modified.
  assert(pos > 0);
  return pos;
}

static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
{
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
  ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
  ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
    CCV_NNC_TENSOR_PLACEHOLDER,
    tv,
  }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
  CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
  CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
  return mv_pos;
}

static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
{
  ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr);
  if (!is_multiview)
    return pos;
  while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr))
  {
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
    tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
  }
  const ccv_nnc_tensor_t tensor = *tensor_ptr;
  const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
  ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
  *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
  new_tensor->alias_ref = (uintptr_t)pos;
  ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
  return new_pos;
}
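
// In other words (a descriptive note, not new behavior): given a position that refers to a
// multiview, the function above follows data[0] down to a plain tensor, snapshots it into a fresh
// metadata entry whose alias_ref points back at the original multiview position, and registers the
// new entry via ccv_nnc_tensor_synchronize_to_multiview so it is kept in sync when the multiview is
// rebound; callers can then use the returned "flat" position in its place.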
1001
1002
static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1003
6.10k
{
1004
  // All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1005
  // Each tensor have the designation in assigned array, and offset in allocated_offset.
1006
6.10k
  const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1007
6.10k
  ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1008
6.10k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1009
6.10k
  const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1010
6.10k
  const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1011
6.10k
  const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1012
6.10k
  const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1013
6.10k
  const int unroll_count = graph_prep->unroll_count;
1014
6.10k
  int i, j;
1015
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1016
94.0k
    for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && j < unroll_count; j++)
1017
7
    {
1018
7
      const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1019
7
      if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref]))
1020
3
        TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i]);
1021
7
    }
1022
6.10k
  ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1023
6.10k
  graph_prep->tensor_arena = tensor_arena;
1024
6.10k
  tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1025
6.10k
  tensor_arena->buffers = (void*)(tensor_arena + 1);
1026
6.10k
  tensor_arena->buffer_size = alloc_prep->buffer_size;
1027
6.10k
  tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1028
6.10k
  tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1029
6.10k
  tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1030
6.10k
  tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1031
6.10k
  tensor_arena->pb_vt_tensors = 0;
1032
6.10k
  tensor_arena->vt_alias_r_refs_p = 0;
1033
6.10k
  tensor_arena->vt_alias_r_refs = 0;
1034
6.10k
  tensor_arena->vt_sizes = 0;
1035
6.10k
  tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1036
6.10k
  tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1037
6.10k
  tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1038
6.10k
  tensor_arena->allocator.context.free = allocator.context.free;
1039
6.10k
  tensor_arena->allocator.isa = allocator.isa;
1040
  // Copy alias_ref info back to the tensor arena.
1041
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1042
94.0k
    tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1043
  // Do the buffer copies.
1044
22.1k
  for (i = 0; i < alloc_prep->buffer_size; i++)
1045
16.0k
    tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1046
16.0k
      tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1047
16.0k
      tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1048
6.10k
  if (graph_prep->while_count_tensor)
1049
19
  {
1050
    // If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1051
19
    int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1052
19
    assert((0 << 1) + 1 == pos); // pos must be 0 position.
1053
19
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1054
19
    *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1055
19
  }
1056
6.10k
  assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
1057
6.10k
  if (p_arena && p_graph_prep)
1058
49
  {
1059
    // Don't need to allocate the actual buffer, just use the pointer from the above.
1060
49
    PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
1061
229
    for (i = 0; i < tensor_arena->buffer_size; i++)
1062
180
    {
1063
180
      const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1064
180
      int unref_p_ref = p_ref;
1065
182
      while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1066
2
        unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1067
180
      assert(unref_p_ref >= 0);
1068
180
      const int p_unroll_count = p_graph_prep->unroll_count;
1069
180
      if (p_graph_prep->dup_tensor_block_ref &&
1070
180
        p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1071
180
        p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1072
10
      {
1073
        // This condition means in the parent graph, we point to multiple tensor blocks for the same
1074
        // buffer, therefore, we cannot have one single pointer assigned in this case.
1075
        // Later we will handle this by generate ccv_tensor_multiview_t structure.
1076
10
        tensor_arena->buffers[i].ptr = 0;
1077
10
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1078
10
        continue;
1079
10
      }
1080
      // Otherwise, find the actual buffer pointer.
1081
170
      const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1082
170
      assert(vt_ref >= 0);
1083
170
      const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1084
170
      if (!p_arena->buffers[buffer_ref].ptr)
1085
0
      {
1086
        // Pass it down as 0 ptr.
1087
0
        tensor_arena->buffers[i].ptr = 0;
1088
0
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1089
0
        continue;
1090
0
      }
1091
170
      const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1092
170
      tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1093
170
      PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
1094
170
    }
1095
6.05k
  } else {
1096
    // Now, allocate actual buffers.
1097
6.05k
    PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
1098
21.9k
    for (i = 0; i < tensor_arena->buffer_size; i++)
1099
15.8k
    {
1100
15.8k
      const int buffer_type = tensor_arena->buffers[i].type;
1101
15.8k
      const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
1102
15.8k
#ifdef HAVE_CUDA
1103
15.8k
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1104
2.44k
      {
1105
2.44k
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1106
2.44k
        if (allocator.isa && allocator.isa->alloc)
1107
268
          tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1108
2.18k
        else
1109
2.18k
          tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1110
2.44k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1111
13.4k
      } else {
1112
13.4k
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1113
13.4k
        if (tensor_arena->buffers[i].pin_mem)
1114
11
          tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1115
13.3k
        else
1116
13.3k
          ccmemalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
1117
13.4k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1118
13.4k
      }
1119
#else
1120
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1121
      ccmemalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
1122
      PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1123
#endif
1124
15.8k
      assert(tensor_arena->buffers[i].ptr);
1125
15.8k
    }
1126
6.05k
  }
1127
  // Go over sub_preps and allocate arenas for them. Do it this early because
1128
  // we may reference tensors from sub arenas. The reason we need to reference
1129
  // tensors from sub arenas is that, for output tensors, the sub arena's tensor
1130
  // will have automatic reference updates.
1131
6.15k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1132
50
    if (graph_prep->sub_preps[i])
1133
49
      tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1134
1
    else
1135
1
      tensor_arena->sub_arenas[i] = 0;
1136
6.10k
  memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1137
  // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1138
6.10k
  ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1139
6.15k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1140
50
    if (tensor_arena->sub_arenas[i])
1141
49
    {
1142
49
      assert(graph_prep->sub_preps[i]);
1143
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1144
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1145
49
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1146
45
        for (j = 0; j < node->output_size; j++)
1147
24
        {
1148
24
          const int idx = node->outputs[j];
1149
24
          const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1150
24
          assert(s_idx >= 0);
1151
24
          ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1152
24
          assert(sub_arena_out_tensors[idx] == 0);
1153
24
          ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1154
          // Only assign if it is a multiview tensor.
1155
24
          if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) ||
1156
24
            (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)))
1157
17
            sub_arena_out_tensors[idx] = sub_tensor;
1158
24
        }
1159
49
    }
1160
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
1161
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1162
94.0k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1163
28.9k
    {
1164
28.9k
      const int vt_ref = alloc_prep->vt_blocks[i];
1165
28.9k
      const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1166
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1167
      // previous layer, therefore, cannot really find the buffer ptr.
1168
28.9k
      if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1169
28.9k
        ((graph_prep->dup_tensor_block_ref &&
1170
28.9k
          graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1171
28.9k
          graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1172
28.9k
         (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1173
47
      {
1174
47
        assert(graph_prep->p); // This must be in a sub-graph.
1175
        // If this is an input tensor, and it need to be preserved, wait until when we go through inputs to preserve.
1176
47
        if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1177
4
          continue;
1178
43
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1179
43
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1180
43
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1181
28.8k
      } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])) {
1182
        // When we want to allocate, we don't really need to if it need force broadcast, because we will handle that later.
1183
28.8k
        const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1184
        // If already created, use the same tensor, and continue.
1185
        // Having ptr.
1186
28.8k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1187
28.8k
        ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1188
        // Also, set its allocations.
1189
        // Since tensor view is bit compatible with tensor, we can just cast.
1190
28.8k
        *tensor = ccv_nnc_tensor(tensor_arena->buffers[buffer_ref].ptr + offset, tensor_symbol_info[i].info, 0);
1191
28.8k
        assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size);
1192
        // If we need to force broadcast, we need to wrap it in a multiview.
1193
28.8k
        if (graph_prep->tensor_blocks[i].p_refs[0] &&
1194
28.8k
          _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1195
1
        {
1196
1
          const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1197
1
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1198
1
          ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1199
1
          ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1200
1
            tv,
1201
1
          }, 0, 1, graph_prep->graph, mv);
1202
1
          CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1203
1
          pos = mv_pos;
1204
1
          ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1205
1
        }
1206
28.8k
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1207
28.8k
      }
1208
28.9k
    }
1209
  // Handle binded tensors. First handle cases without aliases.
1210
53.6k
  for (i = 0; i < tensor_bind_size; i++)
1211
47.5k
  {
1212
47.5k
    assert(tensor_binds[i].tensor);
1213
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1214
47.5k
    if (resolved_symbol.d >= 0)
1215
47.5k
    {
1216
47.5k
      int d = resolved_symbol.d;
1217
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1218
1.02k
        continue;
1219
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
1220
      // It has nothing to do with alias.
1221
46.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1222
146
        d = tensor_blocks[d].ref - 1;
1223
      // For binded tensors, it shouldn't be assigned yet.
1224
      // If it is assigned, the pointer should match the ones from the binded tensor.
1225
      // This can only happen if an enforced in-place tensor is binded twice. If that
1226
      // happens, we need to make sure it is binded to the same location.
1227
46.5k
      assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1228
      // See above assertion.
1229
46.5k
      if (tensor_arena->vt_tensors[d])
1230
0
        continue;
1231
46.5k
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1232
0
      {
1233
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1234
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1235
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1236
0
        if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1237
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1238
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1239
0
        if (ccv_nnc_dimension_count(otv->inc) > 0)
1240
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1241
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->inc[j]); }
1242
0
        else // if it doesn't have inc, it is OK to be just as a whole smaller or equal to the binded one.
1243
0
          { assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)); }
1244
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1245
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1246
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1247
46.5k
      } else {
1248
46.5k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1249
46.5k
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1250
46.5k
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.ptr, tensor_symbol_info[d].info, 0);
1251
46.5k
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1252
46.5k
      }
1253
46.5k
    }
1254
47.5k
  }
1255
  // Handle binded tensors. We handle alias here so it can reference to binded tensors.
1256
53.6k
  for (i = 0; i < tensor_bind_size; i++)
1257
47.5k
  {
1258
47.5k
    assert(tensor_binds[i].tensor);
1259
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1260
47.5k
    if (resolved_symbol.d >= 0)
1261
47.5k
    {
1262
47.5k
      int d = resolved_symbol.d;
1263
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1264
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1265
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
1266
      // It has nothing to do with alias.
1267
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1268
146
        d = tensor_blocks[d].ref - 1;
1269
47.5k
      if (tensor_arena->vt_tensors[d])
1270
47.5k
        continue;
1271
      // Assert original alias has no ofs. Otherwise our binding will be problematic.
1272
26
      for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1273
24
        { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0); }
1274
2
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1275
0
      {
1276
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1277
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1278
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1279
0
        if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1280
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1281
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1282
0
        if (ccv_nnc_dimension_count(otv->inc) > 0)
1283
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1284
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->inc[j]); }
1285
0
        else // if it doesn't have inc, it is OK to be just as a whole smaller or equal to the binded one.
1286
0
          { assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)); }
1287
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1288
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1289
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1290
2
      } else {
1291
2
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1292
2
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1293
2
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.ptr, tensor_symbol_info[d].info, 0);
1294
2
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1295
2
      }
1296
2
    }
1297
47.5k
  }
1298
  // Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1299
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1300
    // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1301
94.0k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i])
1302
6.18k
    {
1303
6.18k
      int ref = tensor_blocks[i].ref - 1;
1304
6.18k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && tensor_blocks[ref].ref)
1305
1
        ref = tensor_blocks[ref].ref - 1;
1306
6.18k
      assert(tensor_arena->vt_tensors[ref]);
1307
6.18k
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1308
6.18k
    }
1309
  // Now after refs assigned out, handle the case I need to preserve because I am a sub graph of while loop.
1310
6.10k
  if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1311
21
  {
1312
21
    assert(graph_prep->p);
1313
21
    const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1314
21
    const int p_idx = graph_prep->p_idx - 1;
1315
46
    for (i = 0; i < node->input_size; i++)
1316
25
    {
1317
25
      const int idx = node->inputs[i];
1318
25
      int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1319
25
      assert(!tensor_blocks[block_ref].ref);
1320
25
      const int vt_ref = alloc_prep->vt_blocks[block_ref];
1321
25
      if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1322
18
        continue;
1323
7
      assert(vt_ref >= 0);
1324
7
      const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1325
7
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1326
7
      assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]));
1327
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1328
      // previous layer, therefore, cannot really find the buffer ptr.
1329
7
      if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1330
7
        ((graph_prep->dup_tensor_block_ref &&
1331
7
          graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1332
7
          graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1333
7
         !tensor_arena->buffers[buffer_ref].ptr))
1334
4
      {
1335
        // We haven't allocated anything for this yet.
1336
4
        assert(tensor_arena->vt_tensors[block_ref] == 0);
1337
4
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1338
4
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1339
4
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1340
4
      } else {
1341
3
        const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1342
3
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1343
3
        ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1344
3
      }
1345
7
    }
1346
21
  }
1347
  // For case..of statement, the output is a phi variable, thus, if we take the skip branch, we will select the original input.
1348
  // This created the multi-view tensor to achieve that.
1349
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1350
94.0k
    if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1351
10
    {
1352
10
      const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1353
      // Create phi multi-view.
1354
10
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1355
10
      const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1356
10
      const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1357
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1358
10
      ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1359
10
      ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1360
10
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1361
10
        intv,
1362
10
        outv,
1363
10
      }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI, mv);
1364
10
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1365
10
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1366
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1367
10
      ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1368
10
    }
1369
  // Now it is time to handle alias.
1370
37.9k
  for (i = 0; i < alloc_prep->block_size; i++)
1371
31.8k
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1372
31.7k
    {
1373
31.7k
      const int block_ref = alloc_prep->blocks[i].block_ref;
1374
31.7k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1375
2.79k
      {
1376
        // Assigning out the tensor aliases.
1377
2.79k
        assert(tensor_symbol_info[block_ref].alias_ref);
1378
2.79k
        const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1379
        // It referenced to is not an alias.
1380
2.79k
        assert(tensor_arena->vt_tensors[alias_ref]);
1381
2.79k
        const int alias_pos = (int)(intptr_t)tensor_arena->vt_tensors[alias_ref];
1382
2.79k
        const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1383
2.79k
        assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1384
        // Will use that to determine whether insert reference or not.
1385
2.79k
        const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1386
2.81k
        while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1387
13
        {
1388
13
          const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1389
13
          alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1390
13
        }
1391
2.79k
        const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1392
        // If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1393
2.79k
        int pos;
1394
2.79k
        if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1395
2.79k
          memcmp(tensor_symbol_info[block_ref].inc, tensor_symbol_info[block_ref].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
1396
2.72k
        {
1397
2.72k
          pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1398
2.72k
          ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1399
2.72k
          *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1400
2.72k
        } else {
1401
71
          pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1402
71
          ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1403
          // Otherwise initialize a tensor view
1404
71
          *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].inc);
1405
71
          tensor_view->alias_ref = (uintptr_t)alias_pos;
1406
71
        }
1407
2.79k
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1408
2.79k
        if (is_multiview)
1409
13
        {
1410
13
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1411
13
          ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1412
13
        }
1413
2.79k
      }
1414
31.7k
    }
1415
  // Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1416
6.15k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1417
50
    if (tensor_arena->sub_arenas[i])
1418
49
    {
1419
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1420
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1421
138
      for (j = 0; j < node->input_size; j++)
1422
89
      {
1423
89
        const int idx = node->inputs[j];
1424
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1425
89
        if (s_idx < 0)
1426
23
          continue;
1427
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1428
        // Only do the replacement if it is a multi-view tensor.
1429
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1430
66
        if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1431
18
        {
1432
          // It cannot be binded tensor.
1433
18
          assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]));
1434
18
          const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1435
18
          const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1436
18
          ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1437
          // If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1438
          // to this tensor.
1439
18
          if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor))
1440
6
          {
1441
6
            const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1442
6
            ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1443
6
            ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1444
6
            ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1445
6
            ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1446
6
            ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)[0]);
1447
6
            while (CCV_IS_TENSOR_MULTIVIEW(tv))
1448
0
              tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]);
1449
6
            *ref_tensor = ccv_nnc_tensor(tv->data.ptr, tv->info, 0);
1450
6
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1451
6
          } else
1452
12
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1453
18
        }
1454
66
      }
1455
49
    }
1456
  // After aliases are created, for the case..of statement, we now revert back to a flat tensor rather than a multi-view.
1457
  // No worries though, this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1458
  // when initializing the case..of node, which will take the phi multi-view again.
1459
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1460
94.0k
    if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1461
10
    {
1462
10
      assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]));
1463
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1464
10
      assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI);
1465
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1466
10
    }
1467
  // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1468
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1469
94.0k
    if (tensor_arena->vt_tensors[i])
1470
84.4k
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1471
  // Associate multiview tensors from sub arena to the parent.
1472
6.10k
  if (sub_arena_out_tensors)
1473
29
  {
1474
242
    for (i = 0; i < alloc_prep->block_size; i++)
1475
213
      if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1476
113
      {
1477
113
        const int block_ref = alloc_prep->blocks[i].block_ref;
1478
113
        if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1479
0
          continue;
1480
113
        int sub_arena_ref = block_ref;
1481
113
        if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1482
10
        {
1483
          // Assigning out the tensor aliases.
1484
10
          assert(tensor_symbol_info[block_ref].alias_ref);
1485
10
          const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1486
          // It referenced to is not an alias.
1487
10
          assert(tensor_arena->vt_tensors[alias_ref]);
1488
10
          sub_arena_ref = alias_ref;
1489
10
          if (!sub_arena_out_tensors[sub_arena_ref])
1490
3
            continue;
1491
10
        }
1492
110
        if (!sub_arena_out_tensors[sub_arena_ref])
1493
86
          continue;
1494
24
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref]) ? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1495
24
        assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1496
        // This is only possible if the vt_tensors is a phi node.
1497
24
        if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1498
0
        {
1499
          // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1500
0
          ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1501
0
          assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI);
1502
0
          assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]));
1503
0
          CCV_NNC_MULTIVIEW_DATA(phi)[1]->alias_ref = (uintptr_t)mv;
1504
0
          ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)[1]);
1505
24
        } else {
1506
24
          tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1507
24
          ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1508
24
        }
1509
24
      }
1510
29
  }
1511
  // Go over all the tensors that have an assign_ref. If the tensor it is assigned from is:
1512
  // 1). From sub_arena_out_tensors: it is possible that it now points to an area this arena doesn't know about.
1513
  // 2). From a phi multi-view: in this case, this arena won't know beforehand which memory it is going to use.
1514
  // Therefore, for the above two scenarios, a tensor that has an assign_ref, even if it is a multiview tensor, needs to subscribe
1515
  // to the output of the assign_ref tensor.
1516
100k
  for (i = 0; i < tensor_symbol_info_size; i++)
1517
94.0k
    if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1518
25
    {
1519
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1520
25
      ccv_nnc_tensor_t* assign_tensor;
1521
25
      if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1522
0
        assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref]) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1523
25
      else
1524
25
        assign_tensor = tensor_arena->vt_tensors[assign_ref];
1525
25
      ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1526
25
    }
1527
  // After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1528
53.6k
  for (i = 0; i < tensor_bind_size; i++)
1529
47.5k
  {
1530
47.5k
    assert(tensor_binds[i].tensor);
1531
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1532
47.5k
    if (resolved_symbol.d >= 0)
1533
47.5k
    {
1534
47.5k
      int d = resolved_symbol.d;
1535
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
1536
      // It has nothing to do with alias.
1537
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1538
146
        d = tensor_blocks[d].ref - 1;
1539
      // Note we don't trace back on alias. This is intentional.
1540
47.5k
      assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1541
47.5k
    }
1542
47.5k
  }
1543
6.10k
  if (sub_arena_out_tensors)
1544
29
    ccfree(sub_arena_out_tensors);
1545
  // Rewire sub arena's tensor references.
1546
6.15k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1547
50
    if (tensor_arena->sub_arenas[i])
1548
49
    {
1549
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1550
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1551
138
      for (j = 0; j < node->input_size; j++)
1552
89
      {
1553
89
        const int idx = node->inputs[j];
1554
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1555
89
        if (s_idx < 0)
1556
23
          continue;
1557
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1558
        // Only do the replacement if it is a multi-view tensor.
1559
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1560
66
        if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor))
1561
18
        {
1562
          // This is binded tensor, bind it now.
1563
18
          if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1564
0
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1565
18
          else
1566
18
            _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1567
18
        }
1568
66
      }
1569
49
    }
1570
6.10k
  return tensor_arena;
1571
6.10k
}
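Two allocation patterns above are worth calling out: the arena itself is a single ccmalloc whose trailing bytes are carved into the buffers, sub_arenas, vt_tensors and vt_alias_refs arrays, and each concrete tensor is only a header pointing at buffers[buffer_ref].ptr + offset inside one of the big buffers. A stripped-down sketch of that single-allocation layout, using invented names (not the library's API):

#include <stdlib.h>

/* Hypothetical mini-arena: one allocation, trailing arrays laid out back to back,
 * mirroring how tensor_arena->buffers / sub_arenas / vt_tensors are wired up above. */
typedef struct {
	unsigned char* ptr;
	size_t size;
} toy_buffer_t;

typedef struct {
	int buffer_size;
	int tensor_size;
	toy_buffer_t* buffers; /* points right after the struct itself */
	void** tensors;        /* points right after the buffers array */
} toy_arena_t;

static toy_arena_t* toy_arena_new(const int buffer_size, const int tensor_size)
{
	toy_arena_t* const arena = (toy_arena_t*)malloc(sizeof(toy_arena_t) + sizeof(toy_buffer_t) * buffer_size + sizeof(void*) * tensor_size);
	arena->buffer_size = buffer_size;
	arena->tensor_size = tensor_size;
	arena->buffers = (toy_buffer_t*)(arena + 1);
	arena->tensors = (void**)(arena->buffers + buffer_size);
	return arena;
}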
1572
1573
static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1574
17
{
1575
17
  assert(graph);
1576
17
  if ((intptr_t)graph == tensor_arena->graph_ref)
1577
7
  {
1578
7
    assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size);
1579
7
    return tensor_arena->vt_tensors[pair_ref];
1580
7
  }
1581
10
  int i;
1582
13
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1583
10
    if (tensor_arena->sub_arenas[i])
1584
10
    {
1585
10
      ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1586
10
      if (tensor)
1587
7
        return tensor;
1588
10
    }
1589
3
  return 0;
1590
10
}
1591
1592
static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1593
7
{
1594
7
  if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
1595
5
    tensor->type |= CCV_TAPE_ALLOC;
1596
2
  else {
1597
2
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1598
2
    mv->type |= CCV_TAPE_ALLOC;
1599
2
    int i;
1600
5
    for (i = 0; i < mv->repeat + mv->kind; i++)
1601
3
      _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1602
2
  }
1603
7
}
1604
1605
static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1606
6.10k
{
1607
6.10k
  assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
1608
6.10k
  int i;
1609
100k
  for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1610
94.0k
  {
1611
94.0k
    if (graph_prep->tensor_symbol_info[i].pair_ref)
1612
7
    {
1613
7
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1614
      // No need to continue check this if it is from its pair.
1615
7
      continue;
1616
7
    }
1617
94.0k
    if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1618
7
    {
1619
      // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1620
7
      if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i]))
1621
5
      {
1622
5
        const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1623
5
        if (vt_ref >= 0 &&
1624
5
          TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref]) == READ_ONLY)
1625
3
          continue;
1626
5
      }
1627
4
      _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1628
4
    }
1629
94.0k
  }
1630
6.15k
  for (i = 0; i < graph_prep->sub_prep_size; i++)
1631
50
    if (graph_prep->sub_preps[i])
1632
49
      _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1633
6.10k
}
1634
1635
static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1636
132k
{
1637
132k
  int i, found = 0;
1638
  // Try to insert head.
1639
132k
  ccv_array_t* head = tensor_blocks.head;
1640
132k
  assert(head);
1641
135k
  for (i = 0; i < head->rnum;)
1642
62.3k
  {
1643
62.3k
    const int head_idx = *(int*)ccv_array_get(head, i);
1644
62.3k
    if (head_idx == idx)
1645
114
    {
1646
114
      found = 1;
1647
114
      break;
1648
114
    }
1649
62.2k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1650
62.2k
    if (cell.i32 && cell.i32[0] > 0)
1651
41
    {
1652
      /* If the current node is the parent of the head node, check if we found it or not. */
1653
      /* If not found, replace the current one. */
1654
41
      if (!found)
1655
41
      {
1656
41
        found = 1;
1657
41
        *(int*)ccv_array_get(head, i) = idx;
1658
41
      } else {
1659
        /* Remove the current one, change the rnum. */
1660
0
        if (i < head->rnum - 1)
1661
0
          *(int*)ccv_array_get(head, i) = *(int*)ccv_array_get(head, head->rnum - 1);
1662
0
        --head->rnum;
1663
0
        continue;
1664
0
      }
1665
62.1k
    } else {
1666
      // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1667
62.1k
      cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1668
62.1k
      if (cell.i32 && cell.i32[0] > 0)
1669
59.9k
      {
1670
59.9k
        found = 1;
1671
59.9k
        break;
1672
59.9k
      }
1673
62.1k
    }
1674
    /* Advancing i. */
1675
2.27k
    ++i;
1676
2.27k
  }
1677
  /* If not found, push this idx to the end of the array. */
1678
132k
  if (!found)
1679
72.6k
    ccv_array_push(head, &idx);
1680
  // Try to insert tail.
1681
132k
  found = 0;
1682
132k
  ccv_array_t* tail = tensor_blocks.tail;
1683
132k
  assert(tail);
1684
191k
  for (i = 0; i < tail->rnum;)
1685
63.4k
  {
1686
63.4k
    const int tail_idx = *(int*)ccv_array_get(tail, i);
1687
63.4k
    if (tail_idx == idx)
1688
4.46k
    {
1689
4.46k
      found = 1;
1690
4.46k
      break;
1691
4.46k
    }
1692
59.0k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1693
59.0k
    if (cell.i32 && cell.i32[0] > 0)
1694
56.4k
    {
1695
      /* If the current node is the child of the tail node, check if we found it or not. */
1696
      /* If not found, replace the current one. */
1697
56.4k
      if (!found)
1698
55.4k
      {
1699
55.4k
        found = 1;
1700
55.4k
        *(int*)ccv_array_get(tail, i) = idx;
1701
55.4k
      } else {
1702
        /* Remove the current one, change the rnum. */
1703
1.08k
        *(int*)ccv_array_get(tail, i) = *(int*)ccv_array_get(tail, tail->rnum - 1);
1704
1.08k
        --tail->rnum;
1705
1.08k
        continue;
1706
1.08k
      }
1707
56.4k
    } else {
1708
      // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1709
2.52k
      cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1710
2.52k
      if (cell.i32 && cell.i32[0] > 0)
1711
110
      {
1712
110
        found = 1;
1713
110
        break;
1714
110
      }
1715
2.52k
    }
1716
    /* Advancing i. */
1717
57.8k
    ++i;
1718
57.8k
  }
1719
  /* If not found, push this idx to the end of the array. */
1720
132k
  if (!found)
1721
72.8k
    ccv_array_push(tail, &idx);
1722
132k
}
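The head/tail arrays above are kept as the extreme nodes of a tensor block's lifetime under the exec_dep partial order: a new node replaces an entry it strictly precedes (for head) or follows (for tail), is dropped when an existing entry already dominates it, and is appended only when it is incomparable with everything stored so far. A simplified sketch of that pruning for a generic "must run earlier" predicate (the callback is assumed; the real code queries the sparse exec_dep matrix and also deduplicates after a replacement):

/* Hypothetical: maintain the set of earliest (minimal) nodes of a partial order.
 * earlier(a, b) returns nonzero when a must execute before b. Simplified: the real
 * routine keeps scanning after a replacement so it can also drop duplicates. */
static void toy_keep_earliest(int* const set, int* const count, const int idx, int (*earlier)(int, int))
{
	int i;
	for (i = 0; i < *count; i++)
	{
		if (set[i] == idx)
			return; /* already recorded */
		if (earlier(idx, set[i]))
		{
			set[i] = idx; /* idx precedes an existing entry, so it replaces it */
			return;
		}
		if (earlier(set[i], idx))
			return; /* an existing entry already precedes idx, nothing to add */
	}
	set[(*count)++] = idx; /* incomparable with everything kept so far */
}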
1723
1724
ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1725
6.66k
{
1726
6.66k
  if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1727
6.56k
  {
1728
6.56k
    assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size);
1729
6.56k
    ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1730
6.56k
    if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor))
1731
11
    {
1732
11
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1733
22
      while (CCV_IS_TENSOR_MULTIVIEW(mv))
1734
11
        mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1735
11
      return (ccv_nnc_tensor_t*)mv;
1736
11
    }
1737
6.54k
    return tensor;
1738
6.56k
  }
1739
100
  int i;
1740
123
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1741
99
    if (tensor_arena->sub_arenas[i])
1742
99
    {
1743
99
      ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1744
99
      if (tensor)
1745
76
        return tensor;
1746
99
    }
1747
24
  return 0;
1748
100
}
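ccv_nnc_tensor_from_symbol is the public lookup used after compilation to reach the concrete tensor backing a symbol, descending into sub-arenas and unwrapping multi-views as shown above. A typical call site might look like the following sketch (the arena and symbol names are placeholders from an assumed prior ccv_nnc_symbolic_graph_compile call, not taken from this file):

/* Assuming `tensor_arena` was produced by ccv_nnc_symbolic_graph_compile() and
 * `x` is a ccv_nnc_tensor_symbol_t declared on the same symbolic graph. */
ccv_nnc_tensor_t* const x_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, x);
if (x_tensor) /* 0 when the symbol is backed by no arena (e.g. pruned or unassigned) */
	x_tensor->data.f32[0] = 1; /* write directly into the arena-backed storage */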
1749
1750
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1751
66.6k
{
1752
66.6k
  if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1753
66.6k
  {
1754
66.6k
    assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size);
1755
66.6k
    return graph_exec_arena->graph_execs[symbol.d];
1756
66.6k
  }
1757
7
  int i;
1758
9
  for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1759
7
    if (graph_exec_arena->sub_arenas[i])
1760
7
    {
1761
7
      ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1762
7
      if (!CCV_NO_GRAPH_EXEC(exec))
1763
5
        return exec;
1764
7
    }
1765
2
  return (ccv_nnc_graph_exec_t){}; // 0.
1766
7
}
1767
1768
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1769
9
{
1770
9
  return graph_exec_arena->source;
1771
9
}
1772
1773
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1774
9
{
1775
9
  return graph_exec_arena->destination;
1776
9
}
1777
1778
// Check whether the head is the beginning of this block.
1779
static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1780
50
{
1781
50
  assert(tensor_block->head);
1782
50
  return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0) == head_node);
1783
50
}
1784
1785
// Check whether the tail is the end of this block.
1786
static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
1787
39
{
1788
39
  assert(tensor_block->tail);
1789
39
  return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0) == tail_node);
1790
39
}
1791
1792
// Make two tensor blocks one. Return 1 if that happened.
1793
static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
1794
6.43k
{
1795
  // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
1796
6.43k
  if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]) &&
1797
6.43k
    (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1]) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
1798
6.43k
    tensor_blocks[p_ref_0].tail->rnum == 1 &&
1799
6.43k
    tensor_blocks[p_ref_1].head->rnum == 1 &&
1800
6.43k
    tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
1801
6.43k
    *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0))
1802
6.18k
  {
1803
    // If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
1804
6.18k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]));
1805
6.18k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]));
1806
6.18k
    ccv_array_free(tensor_blocks[p_ref_0].tail);
1807
6.18k
    tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
1808
6.18k
    if (tensor_blocks[p_ref_1].p_refs[0])
1809
14
    {
1810
14
      assert(tensor_blocks[p_ref_1].p_refs[1] == 0); // It simply cannot have more than one p_refs, otherwise we cannot merge.
1811
14
      if (!tensor_blocks[p_ref_0].p_refs[0])
1812
10
        tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
1813
4
      else
1814
4
        tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
1815
14
    }
1816
6.18k
    tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
1817
6.18k
    TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]));
1818
6.18k
    ccv_array_free(tensor_blocks[p_ref_1].head);
1819
6.18k
    if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1]))
1820
16
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]);
1821
    // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
1822
6.18k
    TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1]);
1823
6.18k
    tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
1824
6.18k
    if (!tensor_blocks[p_ref_0].r_refs)
1825
6.12k
      tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
1826
6.18k
    ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
1827
6.18k
    tensor_blocks[p_ref_1].size = 0;
1828
6.18k
    tensor_blocks[p_ref_1].head = 0;
1829
6.18k
    tensor_blocks[p_ref_1].tail = 0;
1830
6.18k
    return 1;
1831
6.18k
  }
1832
247
  return 0;
1833
6.43k
}
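Folding above merges two blocks whose lifetimes meet at exactly one node: the earlier block absorbs the later block's tail, and the later block gives up its memory and keeps only a ref back to the survivor. A tiny sketch of just the interval part of that decision (invented types; the real check also compares memory type, read/write flags and foldability):

/* Hypothetical: two lifetime intervals over the execution order can share one
 * allocation when the first ends exactly where the second begins (the in-place case). */
typedef struct {
	int start;
	int end;
} toy_interval_t;

static int toy_try_fold(toy_interval_t* const a, toy_interval_t* const b)
{
	if (a->end != b->start)
		return 0; /* lifetimes do not meet at a single node, cannot share memory */
	a->end = b->end; /* the surviving block now covers both lifetimes */
	b->start = b->end = -1; /* the folded block no longer owns memory of its own */
	return 1;
}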
1834
1835
static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
1836
6.12k
{
1837
6.12k
  int i, j, k;
1838
  // Generate exec dependencies (or, in other words, partial ordering of executions).
1839
6.12k
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
1840
6.12k
  int* buf = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
1841
6.12k
  int buf_size;
1842
6.12k
  if (p_node_info)
1843
62
    { assert(output_size == 0); }
1844
6.12k
#define for_block(x, val) \
1845
210k
  do { \
1846
210k
    if (((int32_t*)val)[0] > 0) \
1847
210k
    { \
1848
210k
      buf[buf_size * 2] = x; \
1849
210k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
1850
210k
      ++buf_size; \
1851
210k
    } \
1852
210k
  } while (0)
1853
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
1854
31.8k
    buf_size = 0; /* save all its parent deps to this buffer */
1855
31.8k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
1856
31.8k
    if (vector)
1857
210k
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
1858
31.8k
    if (!node->outgoings)
1859
6.77k
      continue;
1860
53.1k
    for (i = 0; i < node->outgoings->rnum; i++)
1861
28.0k
    {
1862
28.0k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
1863
28.0k
      const int32_t one = 1;
1864
28.0k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
1865
      /* If not found, set it. If the current node is the destination node, there is no need
1866
       * to set itself as parent of subsequent nodes, because of its terminal nature. */
1867
28.0k
      if (!term && (!cell.i32 || cell.i32[0] == 0))
1868
27.5k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
1869
264k
      for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
1870
236k
      {
1871
236k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
1872
        /* If not found, set */
1873
236k
        if (!cell.i32 || cell.i32[0] == 0)
1874
204k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
1875
31.9k
        else {
1876
          /* Otherwise, set to the longest one */
1877
31.9k
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
1878
31.9k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
1879
31.9k
        }
1880
236k
      }
1881
28.0k
    }
1882
25.1k
  } ccv_nnc_graph_visit_endfor
1883
6.12k
#undef for_block
1884
6.12k
  ccfree(buf);
1885
  // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
1886
6.12k
  const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
1887
6.12k
  ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
1888
  // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
1889
  // happens that I have to loop through all relevant nodes to find out whether one is used or not.
1890
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1891
94.1k
    tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
1892
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
1893
121k
    for (i = 0; i < node->input_size; i++)
1894
90.0k
      if (node->inputs[i] >= 0)
1895
65.8k
      {
1896
65.8k
        tensor_blocks[node->inputs[i]].flags = 0;
1897
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1898
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1899
65.8k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
1900
65.8k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
1901
12
          tensor_blocks[node->inputs[i]].pin_mem = 1;
1902
65.8k
      }
1903
81.8k
    for (i = 0; i < node->output_size; i++)
1904
49.9k
      if (node->outputs[i] >= 0)
1905
43.0k
      {
1906
43.0k
        tensor_blocks[node->outputs[i]].flags = 0;
1907
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1908
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1909
43.0k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
1910
43.0k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
1911
10
          tensor_blocks[node->outputs[i]].pin_mem = 1;
1912
43.0k
      }
1913
31.8k
  } ccv_nnc_graph_visit_endfor
1914
6.12k
  if (p_node_info)
1915
62
  {
1916
62
    assert(p_tensor_symbol_info);
1917
    // Mark it as used if it is used in either input or output.
1918
165
    for (i = 0; i < p_node_info->input_size; i++)
1919
103
      if (p_node_info->inputs[i] >= 0)
1920
103
      {
1921
103
        const int d = p_node_info->inputs[i];
1922
103
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
1923
92
        {
1924
92
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
1925
92
          if (dd >= 0) // If this exists in this sub-graph, great.
1926
80
            tensor_blocks[dd].flags = 0;
1927
92
        }
1928
103
      }
1929
132
    for (i = 0; i < p_node_info->output_size; i++)
1930
70
      if (p_node_info->outputs[i] >= 0)
1931
70
      {
1932
70
        const int d = p_node_info->outputs[i];
1933
70
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
1934
70
        {
1935
70
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
1936
70
          if (dd >= 0) // If this exists in this sub-graph, great.
1937
70
            tensor_blocks[dd].flags = 0;
1938
70
        }
1939
70
      }
1940
62
  }
1941
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1942
94.1k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
1943
73.2k
    {
1944
      // Check no tensor info is auto now.
1945
73.2k
      assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
1946
      // If this tensor is used in an assign_ref, set it to be un-foldable. (It will be used as a parameter;
1947
      // therefore, its life-cycle almost certainly won't concatenate properly with the tensor it would
1948
      // fold into.)
1949
73.2k
      if (tensor_symbol_info[i].assign_ref)
1950
40
      {
1951
        // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
1952
        // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input,
1953
        // it keeps its own representation, which is not the case for output).
1954
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
1955
40
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1956
        // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
1957
40
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref]);
1958
        // It also cannot be folded as output (except i), because we need to keep its own representation.
1959
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref]);
1960
40
        assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0);
1961
40
        tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
1962
63
        for (j = 0; j < unroll_count; j++)
1963
23
        {
1964
23
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
1965
23
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
1966
23
        }
1967
40
        if (tensor_blocks[assign_ref].bypass_ref)
1968
4
        {
1969
          // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
1970
4
          tensor_blocks[assign_ref].unfoldable_except_ref = 0;
1971
4
          const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
1972
4
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref]);
1973
4
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref]);
1974
          // On the other hand, it can be folded into the except_ref for the bypass_ref.
1975
4
          tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
1976
4
          if (dup_tensor_from_ref)
1977
2
          {
1978
2
            const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
1979
2
            if (bypass_from_ref >= 0)
1980
2
            {
1981
2
              TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref]);
1982
2
              TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref]);
1983
2
              assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref);
1984
2
              for (j = 0; j < unroll_count - 1; j++)
1985
0
              {
1986
                // Mark every incarnation as unfold-able.
1987
0
                TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
1988
0
                TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
1989
0
              }
1990
2
            }
1991
2
          }
1992
4
        }
1993
40
      }
1994
73.2k
    }
1995
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
1996
94.1k
  {
1997
    // If it has a pair reference, we don't need to allocate this tensor at all,
1998
    // set it to be unassigned.
1999
94.1k
    if (tensor_symbol_info[i].pair_ref)
2000
15
      TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i]);
2001
    // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use tape properly).
2002
94.1k
    else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2003
7
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2004
7
      TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2005
      // For this case, there is no exception.
2006
7
      tensor_blocks[i].unfoldable_except_ref = 0;
2007
94.1k
    } else if (tensor_symbol_info[i].p_ref) {
2008
119
      assert(p_node_info);
2009
119
      const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2010
      // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2011
119
      if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2012
        // TODO: This check can be lifted if we can fold in the parent graph.
2013
48
        if (-1 == p_ref_is_in_or_out)
2014
20
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2015
119
      if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be folded as input.
2016
68
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2017
119
    }
2018
94.1k
  }
2019
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2020
94.1k
  {
2021
94.1k
    if (tensor_symbol_info[i].alias_ref)
2022
3.50k
    {
2023
3.50k
      const int ref = tensor_symbol_info[i].alias_ref - 1;
2024
      // If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2025
3.50k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
2026
1.03k
        tensor_blocks[ref].flags = 0;
2027
      // An alias cannot ref to another alias.
2028
3.50k
      assert(!tensor_symbol_info[ref].alias_ref);
2029
3.50k
      tensor_blocks[i].flags = ALIAS;
2030
3.50k
      tensor_blocks[i].ref = ref + 1; // Assign the ref.
2031
3.50k
      if (!tensor_blocks[ref].r_refs)
2032
3.46k
        tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2033
3.50k
      ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2034
3.50k
    }
2035
94.1k
  }
2036
  // Scan again and if the ref is not assigned, mark the alias not assigned.
2037
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2038
94.1k
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
2039
3.50k
    {
2040
3.50k
      const int ref = tensor_blocks[i].ref - 1;
2041
3.50k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]))
2042
676
      {
2043
        // Mark this as unassigned.
2044
676
        tensor_blocks[i].flags = UNASSIGNED;
2045
676
        tensor_blocks[i].ref = 0;
2046
676
      }
2047
3.50k
    }
2048
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2049
94.1k
  {
2050
    // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2051
94.1k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
2052
71.4k
    {
2053
71.4k
      tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2054
71.4k
      tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2055
      // Cache tensor size (align to 16 bytes).
2056
71.4k
      tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2057
71.4k
    }
2058
    // If there is a p_ref, add the one to the p_refs list.
2059
94.1k
    if (tensor_symbol_info[i].p_ref)
2060
128
      tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2061
94.1k
  }
2062
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2063
121k
    for (i = 0; i < node->input_size; i++)
2064
90.0k
    {
2065
90.0k
      int d = node->inputs[i];
2066
90.0k
      if (d < 0)
2067
24.2k
        continue;
2068
65.8k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2069
1.70k
        d = tensor_symbol_info[d].alias_ref - 1;
2070
65.8k
      tensor_blocks[d].flags |= READ_ONLY;
2071
65.8k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2072
15
        continue;
2073
65.7k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2074
      /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph
2075
       * from the very beginning of the graph life-cycle and ends here). */
2076
65.7k
      if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags))
2077
28.3k
      {
2078
89.1k
        for (j = 0; j < source_size; j++)
2079
60.8k
        {
2080
          // If the source is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2081
60.8k
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2082
60.8k
          if (cell.i32 && cell.i32[0] > 0)
2083
23.7k
            _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2084
60.8k
        }
2085
        /* If this is a read-only (based on SSA, if first encountered as read), and this is
2086
         * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2087
         * loop, however, in that case, you need to prevent read-only gets reused for the
2088
         * output tensor, which is not obvious how to implement correctly), and it is not
2089
         * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2090
         * of memory anyway (because on second loop, we want to read the same value out).
2091
         * Mark it to the end of the graph. */
2092
28.3k
        if (p_node_info && !tensor_symbol_info[d].assign_ref)
2093
210
          for (j = 0; j < destination_size; j++)
2094
105
          {
2095
            // If the destination is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2096
105
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2097
105
            if (cell.i32 && cell.i32[0] > 0)
2098
65
              _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2099
105
          }
2100
28.3k
      }
2101
65.7k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2102
65.7k
    }
2103
81.8k
    for (i = 0; i < node->output_size; i++)
2104
49.9k
    {
2105
49.9k
      int d = node->outputs[i];
2106
49.9k
      if (d < 0)
2107
6.87k
        continue;
2108
43.0k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2109
1.35k
        d = tensor_symbol_info[d].alias_ref - 1;
2110
43.0k
      tensor_blocks[d].flags |= WRITE_ONLY;
2111
43.0k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2112
0
        continue;
2113
43.0k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2114
43.0k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2115
43.0k
    }
2116
31.8k
  } ccv_nnc_graph_visit_endfor
2117
  // For any assign_ref, its life-time kept until the end and wrap over.
2118
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2119
    // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2120
    // that "somewhere else" need to keep its life-time til the end.
2121
94.1k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) &&
2122
94.1k
      p_node_info && tensor_symbol_info[i].assign_ref)
2123
42
    {
2124
42
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2125
84
      for (j = 0; j < destination_size; j++)
2126
42
      {
2127
        // This logic is to be more conservative about which destination we add to.
2128
        // As of now, if we add everything, it is fine most likely. However, it may
2129
        // cause issues in the future to do so naively. Thus, instead, we only add
2130
        // the destination to it iff either the tensor is not used at all, or, the
2131
        // destination is on the same stream as of the tensor block some way.
2132
42
        int flag = !tensor_blocks[assign_ref].tail;
2133
83
        for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2134
41
        {
2135
41
          const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k);
2136
41
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2137
41
          flag = (cell.i32 && cell.i32[0] > 0);
2138
41
        }
2139
42
        if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2140
10
          _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2141
42
      }
2142
42
    }
2143
6.19k
  for (i = 0; i < output_size; i++)
2144
70
  {
2145
70
    assert(outputs[i].graph == symbolic_graph);
2146
70
    int d = outputs[i].d;
2147
70
    if (d < 0)
2148
0
      continue;
2149
70
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2150
0
      d = tensor_symbol_info[d].alias_ref - 1;
2151
70
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2152
0
      continue;
2153
70
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2154
286
    for (j = 0; j < destination_size; j++)
2155
216
    {
2156
216
      int flag = !tensor_blocks[d].tail;
2157
432
      for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2158
216
      {
2159
216
        const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
2160
216
        const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2161
216
        flag = (cell.i32 && cell.i32[0] > 0);
2162
216
      }
2163
216
      if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2164
16
        _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2165
216
    }
2166
70
  }
2167
  // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2168
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2169
31.8k
    int x, y;
2170
121k
    for (x = 0; x < node->input_size; x++)
2171
258k
      for (y = 0; y < node->output_size; y++)
2172
        /* Some operations enforces some tensors to be the same for inputs / outputs. */
2173
168k
        if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2174
180
        {
2175
          // If both unassigned, it is fine.
2176
180
          if (node->inputs[x] < 0 && node->outputs[y] < 0)
2177
0
            continue;
2178
180
          int ref = node->inputs[x];
2179
180
          assert(ref >= 0);
2180
180
          while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
2181
0
            ref = tensor_blocks[ref].ref - 1;
2182
180
          const int node_output_y = node->outputs[y];
2183
180
          assert(node_output_y >= 0);
2184
          // If both are not computable, it is fine, we don't need to enforce.
2185
180
          if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2186
180
            !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y]))
2187
0
            continue;
2188
          // Otherwise, enforce and error out if failed.
2189
180
          if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2190
0
            { assert(0 && "cannot enforce inplace for the two tensors"); }
2191
180
        }
2192
31.8k
  } ccv_nnc_graph_visit_endfor
2193
  // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2194
  // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2195
  // that is not enforced in-place (because the tensor enforced in-place will be different than the
2196
  // binding one).
2197
53.7k
  for (i = 0; i < tensor_bind_size; i++)
2198
47.5k
  {
2199
47.5k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2200
    // If there is a tensor bound, then it is unassigned.
2201
47.5k
    if (resolved_symbol.d >= 0)
2202
47.5k
    {
2203
47.5k
      int d = resolved_symbol.d;
2204
      // I cannot assert too much at this moment.
2205
47.5k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2206
1.02k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2207
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
2208
      // It has nothing to do with alias.
2209
47.7k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
2210
146
        d = tensor_blocks[d].ref - 1;
2211
      // Doesn't work if this is a loop carrying variable.
2212
47.5k
      assert(!tensor_symbol_info[d].assign_ref);
2213
47.5k
      tensor_blocks[d].flags = UNASSIGNED;
2214
47.5k
      tensor_blocks[d].ref = 0; // No need to have ref as well.
2215
47.5k
    }
2216
47.5k
  }
2217
  // Maximize tensor reuse by collapsing tensors where in-place operations are allowed (and the start / end tensors match).
2218
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2219
31.8k
    int x, y;
2220
121k
    for (x = 0; x < node->input_size; x++)
2221
90.0k
    {
2222
      /* If the input is not assigned, it can be referenced, find the referenced one */
2223
90.0k
      int ref = node->inputs[x];
2224
90.0k
      if (ref < 0)
2225
24.2k
        continue;
2226
72.9k
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
2227
7.17k
        ref = tensor_blocks[ref].ref - 1;
2228
65.8k
      assert(tensor_blocks[ref].ref == 0);
2229
65.8k
      const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2230
65.8k
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2231
65.8k
        tensor_blocks[ref].tail->rnum == 1)
2232
84.5k
        for (y = 0; y < node->output_size; y++)
2233
          /* Only proceed if the input symbol is different from the output symbol, */
2234
          /* and the input symbol meets the output symbol exactly at the same spot. */
2235
52.2k
          if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2236
52.2k
            node->outputs[y] >= 0 &&
2237
52.2k
            ref != node->outputs[y] &&
2238
52.2k
            TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]]))
2239
6.31k
          {
2240
6.31k
            const int node_output_y = node->outputs[y];
2241
6.31k
            const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2242
            /* If dimension matches perfectly, then we can assign y_symbol to x. */
2243
6.31k
            if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
2244
6.25k
              _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2245
6.31k
          }
2246
65.8k
    }
2247
31.8k
  } ccv_nnc_graph_visit_endfor
2248
  // Specifically handle the bypass. This needs to be done after the first pass.
2249
  // I need to extend the bypass life-time to the same as the one I am going with.
2250
  // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2251
6.12k
  ccv_nnc_tensor_block_t empty_block = {};
2252
6.12k
  empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2253
6.12k
  empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2254
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2255
31.8k
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2256
13
    {
2257
13
      int can_bypass = 1;
2258
28
      for (i = 0; can_bypass && i < node->output_size; i++)
2259
15
      {
2260
15
        int d = node->outputs[i];
2261
15
        if (d < 0)
2262
0
          continue;
2263
15
        if (!tensor_blocks[d].bypass_ref)
2264
2
          continue;
2265
13
        while (tensor_blocks[d].ref)
2266
0
          d = tensor_blocks[d].ref - 1;
2267
13
        int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2268
14
        while (tensor_blocks[bypass_ref].ref)
2269
1
          bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2270
        // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2271
13
        if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2272
10
          continue;
2273
3
        ccv_array_clear(empty_block.head);
2274
6
        for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2275
3
          ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j));
2276
3
        ccv_array_clear(empty_block.tail);
2277
6
        for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2278
3
          ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j));
2279
6
        for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2280
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), empty_block);
2281
6
        for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2282
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), empty_block);
2283
        // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2284
3
        assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref));
2285
3
        int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2286
3
        while (tensor_blocks[b_ref].ref)
2287
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2288
3
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2289
3
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2290
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2291
        // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2292
3
        can_bypass = can_bypass && (a_hop_b || b_hop_a);
2293
3
      }
2294
13
      if (can_bypass)
2295
10
      {
2296
22
        for (i = 0; i < node->output_size; i++)
2297
12
        {
2298
12
          int d = node->outputs[i];
2299
12
          if (d < 0)
2300
0
            continue;
2301
12
          if (!tensor_blocks[d].bypass_ref)
2302
2
            continue;
2303
10
          while (tensor_blocks[d].ref)
2304
0
            d = tensor_blocks[d].ref - 1;
2305
10
          int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2306
10
          while (tensor_blocks[bypass_ref].ref)
2307
0
            bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2308
          // The bypass_ref can extend its life-time.
2309
20
          for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2310
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), tensor_blocks[bypass_ref]);
2311
20
          for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2312
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), tensor_blocks[bypass_ref]);
2313
10
        }
2314
10
      } else {
2315
6
        for (i = 0; i < node->output_size; i++)
2316
3
          tensor_blocks[node->outputs[i]].bypass_ref = 0;
2317
3
        const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2318
        // Mark this exec as no bypass IO (thus, I need to insert an explicit data transfer).
2319
3
        exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2320
3
      }
2321
13
    }
2322
31.8k
  } ccv_nnc_graph_visit_endfor
2323
6.12k
  ccv_array_free(empty_block.head);
2324
6.12k
  ccv_array_free(empty_block.tail);
2325
6.12k
  *r_exec_dep = exec_dep;
2326
6.12k
  *r_tensor_blocks = tensor_blocks;
2327
6.12k
}
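A minimal sketch of the hop-count propagation that the visit loop at the top of _ccv_nnc_exec_dep_and_tensor_blocks_prep performs, shown with a dense int matrix standing in for ccv_sparse_matrix_t; dep, node_count, outgoings and outgoing_counts are illustrative names for this sketch, not library API.

// dep[child * node_count + parent] holds the longest hop count from parent to child;
// 0 means "no dependency". Nodes are assumed to be processed in topological order.
static void propagate_exec_deps(int* const dep, const int node_count, const int* const* const outgoings, const int* const outgoing_counts)
{
  int idx, i, j;
  for (idx = 0; idx < node_count; idx++)
    for (i = 0; i < outgoing_counts[idx]; i++)
    {
      const int outgoing = outgoings[idx][i];
      if (dep[outgoing * node_count + idx] < 1)
        dep[outgoing * node_count + idx] = 1; // Direct edge: one hop.
      for (j = 0; j < node_count; j++) // Inherit idx's dependencies, one hop further away, keeping the longest.
        if (dep[idx * node_count + j] > 0 && dep[idx * node_count + j] + 1 > dep[outgoing * node_count + j])
          dep[outgoing * node_count + j] = dep[idx * node_count + j] + 1;
    }
}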
2328
2329
static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2330
33
{
2331
33
  if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2332
3
  {
2333
3
    ccv_nnc_cmd_t retval = cmd;
2334
3
    retval.cmd = CCV_NNC_NOOP;
2335
3
    return retval;
2336
3
  }
2337
30
  return cmd;
2338
33
}
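For reference, this substitution callback is what gets handed to ccv_nnc_symbolic_graph_dup when the graph has to be unrolled later in this file (see _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll below); a call sketch, with graph as an illustrative variable:

// Duplicate the graph while swapping every sub-graph command for a noop: only the
// top-level data flow matters for the unrolled allocation analysis.
ccv_nnc_symbolic_graph_t* const dup_graph = ccv_nnc_symbolic_graph_dup(graph, _ccv_nnc_subst_sub_graph_with_noop);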
2339
2340
static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2341
102
{
2342
102
  if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2343
47
  {
2344
47
    if (tensor_symbol_info[input].alias_ref)
2345
18
    {
2346
18
      const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2347
18
      assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2348
18
      ccv_nnc_tensor_symbol_t tensor_symbol = {};
2349
18
      if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2350
6
      {
2351
6
        tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2352
6
        if (tensor_symbol_info[alias_ref].pair_ref)
2353
0
          ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2354
0
            .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2355
0
            .graph = dup_graph->pair
2356
0
          });
2357
6
        ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2358
6
        dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2359
12
      } else {
2360
12
        tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2361
12
        tensor_symbol.graph = dup_graph;
2362
12
      }
2363
18
      ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].inc, tensor_symbol_info[input].info, 0);
2364
18
      if (tensor_symbol_info[input].pair_ref)
2365
0
        ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2366
0
          .d = tensor_symbol_info[input].pair_ref - 1,
2367
0
          .graph = dup_graph->pair
2368
0
        });
2369
18
      ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2370
18
      dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2371
29
    } else {
2372
29
      ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2373
29
      if (tensor_symbol_info[input].pair_ref)
2374
4
        ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2375
4
          .d = tensor_symbol_info[input].pair_ref - 1,
2376
4
          .graph = dup_graph->pair
2377
4
        });
2378
29
      ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2379
29
      dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2380
29
    }
2381
47
    if (tensor_symbol_info[input].bypass_ref)
2382
2
    {
2383
2
      const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2384
2
      assert(dup_bypass_ref >= 0);
2385
2
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2386
2
      symbol_info->bypass_ref = dup_bypass_ref + 1;
2387
2
    }
2388
47
  }
2389
102
  return (ccv_nnc_tensor_symbol_t) {
2390
102
    .d = dup_tensor_block_ref[input * unroll_count],
2391
102
    .graph = dup_graph,
2392
102
  };
2393
102
}
2394
2395
static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2396
72
{
2397
72
  int i;
2398
72
  if (dup_exec_ref[idx * unroll_count] < 0)
2399
44
  {
2400
    // Input has to come before output, because output could have a bypass reference to the input.
2401
116
    for (i = 0; i < node->input_size; i++)
2402
72
      max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2403
75
    for (i = 0; i < node->output_size; i++)
2404
31
      max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2405
44
    ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2406
44
    dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2407
44
  }
2408
72
  return (ccv_nnc_graph_exec_symbol_t) {
2409
72
    .d = dup_exec_ref[idx * unroll_count],
2410
72
    .graph = dup_graph,
2411
72
  };
2412
72
}
2413
2414
static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2415
6.12k
{
2416
6.12k
  int i;
2417
100k
  for (i = 0; i < tensor_block_size; i++)
2418
94.2k
  {
2419
94.2k
    if (tensor_blocks[i].head)
2420
65.3k
      ccv_array_free(tensor_blocks[i].head);
2421
94.2k
    if (tensor_blocks[i].tail)
2422
65.3k
      ccv_array_free(tensor_blocks[i].tail);
2423
94.2k
    if (tensor_blocks[i].r_refs)
2424
9.58k
      ccv_array_free(tensor_blocks[i].r_refs);
2425
94.2k
    if (tensor_blocks[i].dup_p_refs)
2426
22
      ccv_array_free(tensor_blocks[i].dup_p_refs);
2427
94.2k
  }
2428
6.12k
  ccfree(tensor_blocks);
2429
6.12k
}
2430
2431
// Find tensors that cannot be solved by co-allocating to the same location.
2432
static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2433
21
{
2434
21
  int i, j, unroll_count = 0;
2435
131
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2436
110
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_symbol_info[i].assign_ref)
2437
25
    {
2438
      // This is a parameter; thus, it has to be either an alias or used.
2439
25
      assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2440
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2441
      // The parameter it assign to has to be either an alias or used.
2442
25
      assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2443
      // If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2444
      // If it is the same, we are good, no need to extend.
2445
25
      int a_ref = i;
2446
25
      while (tensor_blocks[a_ref].ref)
2447
0
        a_ref = tensor_blocks[a_ref].ref - 1;
2448
25
      int b_ref = assign_ref;
2449
31
      while (tensor_blocks[b_ref].ref)
2450
6
        b_ref = tensor_blocks[b_ref].ref - 1;
2451
25
      if (a_ref != b_ref)
2452
19
      {
2453
        // If any of the b's head is deterministically later than a's tail
2454
        // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2455
19
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2456
19
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2457
        // It cannot be that both i can hop to j can j can hop to i.
2458
19
        assert(!(a_hop_b > 0 && b_hop_a > 0));
2459
        // Can it be folded
2460
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2461
19
        if (a_hop_b || b_hop_a)
2462
3
        {
2463
3
          tensor_blocks[a_ref].companion_ref = b_ref + 1;
2464
3
          tensor_blocks[b_ref].companion_ref = a_ref + 1;
2465
3
          continue;
2466
3
        }
2467
16
        int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2468
20
        for (j = 0; c_ref >= 0; j++)
2469
4
        {
2470
4
          while (tensor_blocks[c_ref].ref)
2471
0
            c_ref = tensor_blocks[c_ref].ref - 1;
2472
4
          c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2473
4
        }
2474
16
        unroll_count = ccv_max(unroll_count, j + 1);
2475
16
      }
2476
25
    }
2477
  // Reset companion_ref if need to unroll.
2478
21
  if (unroll_count)
2479
91
    for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2480
78
      tensor_blocks[j].companion_ref = 0;
2481
21
  return unroll_count;
2482
21
}
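A small sketch of the assign_ref chain walk above, under the simplifying assumption that no tensor block folding is involved; assign_refs is an illustrative 1-based array (0 meaning none), not a library structure.

// Given the block b_ref that a loop-carried tensor wraps over to, count how far its
// assign_ref chain continues; the required unroll count is that chain length plus one.
static int unroll_count_for_chain(const int* const assign_refs, const int b_ref)
{
  int j = 0;
  int c_ref = assign_refs[b_ref] - 1;
  for (; c_ref >= 0; j++)
    c_ref = assign_refs[c_ref] - 1;
  return j + 1;
}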
2483
2484
static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2485
13
{
2486
13
  int i, j, n;
2487
  // The inout exec nodes, these are the nodes we are going to extend.
2488
13
  uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2489
13
  int max_input_size = 0;
2490
13
  int max_output_size = 0;
2491
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2492
35
  {
2493
35
    max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
2494
35
    max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
2495
35
  }
2496
13
  ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
2497
13
  ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
2498
  // Doing graph expansion
2499
  // It goes without saying that we must have more than one tensor / exec (otherwise I cannot use 0 as "no exec ref").
2500
13
  assert(dup_graph->exec_symbol_info->rnum > 0);
2501
13
  assert(dup_graph->tensor_symbol_info->rnum > 0);
2502
88
#define INCOMING_NODE (1)
2503
28
#define OUTGOING_NODE (2)
2504
  // Unroll the graph n times.
2505
29
  for (n = 0; n < unroll_count; n++)
2506
16
  {
2507
16
    int* const dup_exec_ref = r_dup_exec_ref + n;
2508
16
    const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2509
16
    int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2510
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2511
46
      dup_exec_ref[i * unroll_count] = -1;
2512
131
    for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2513
115
    {
2514
      // If there is an assign_ref, that means I don't need to dup the tensor.
2515
115
      if (tensor_symbol_info[i].assign_ref)
2516
25
      {
2517
25
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2518
25
        dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2519
90
      } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY)
2520
      // If this is a read-only tensor block, no need to duplicate because the value never changes
2521
      // (note we handled assign_ref first), therefore, no need to generate duplicate.
2522
26
        dup_tensor_block_ref[i * unroll_count] = i;
2523
64
      else
2524
64
        dup_tensor_block_ref[i * unroll_count] = -1;
2525
115
    }
2526
    // Go through the original graph, make copies of the node if it is inout.
2527
44
    ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2528
44
      ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2529
44
      inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2530
44
      if (!node->outgoings)
2531
16
        continue;
2532
56
      for (i = 0; i < node->outgoings->rnum; i++)
2533
28
      {
2534
28
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
2535
28
        inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2536
28
        ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2537
28
        ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2538
28
      }
2539
28
    } ccv_nnc_graph_visit_endfor
2540
    // Check that the visited nodes are all marked as either incoming or outgoing.
2541
16
    const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2542
16
    const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2543
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2544
46
    {
2545
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2546
2
        continue;
2547
44
      assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
2548
      // If this is pure incoming nodes, then I need to concat this one with all original destination node
2549
44
      if (inout[i] == INCOMING_NODE)
2550
32
        for (j = 0; j < dup_destination_size; j++)
2551
16
        {
2552
16
          ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2553
16
            .d = dup_destinations[j].d,
2554
16
            .graph = dup_graph,
2555
16
          }, (ccv_nnc_graph_exec_symbol_t) {
2556
16
            .d = dup_exec_ref[i * unroll_count],
2557
16
            .graph = dup_graph,
2558
16
          });
2559
16
        }
2560
44
    }
2561
16
    if (dup_graph->destinations)
2562
16
      ccv_array_clear(dup_graph->destinations);
2563
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2564
46
    {
2565
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2566
2
        continue;
2567
44
      const int d = dup_exec_ref[i * unroll_count];
2568
44
      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
2569
      // If this has no outgoing node, add to the destination.
2570
44
      if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2571
16
        ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2572
16
          .graph = dup_graph,
2573
16
          .d = d,
2574
16
        });
2575
44
    }
2576
16
  }
2577
13
#undef INCOMING_NODE
2578
13
#undef OUTGOING_NODE
2579
13
  ccfree(inout);
2580
13
}
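A sketch of the wiring step each unroll pass above ends with: duplicated nodes that are purely incoming get concatenated after the previous pass's destinations, chaining iteration n + 1 of the loop body behind iteration n. prev_destinations and dup_incoming are assumed arrays for this sketch, not library API.

static void concat_unrolled_pass(ccv_nnc_symbolic_graph_t* const dup_graph, const ccv_nnc_graph_exec_symbol_t* const prev_destinations, const int prev_destination_size, const ccv_nnc_graph_exec_symbol_t* const dup_incoming, const int dup_incoming_size)
{
  int i, j;
  // Every pure-incoming duplicated node becomes a successor of every previous destination.
  for (i = 0; i < dup_incoming_size; i++)
    for (j = 0; j < prev_destination_size; j++)
      ccv_nnc_graph_exec_symbol_concat(dup_graph, prev_destinations[j], dup_incoming[i]);
}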
2581
2582
static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2583
13
{
2584
13
  int i;
2585
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2586
    // Now can assign them (The dup) as companion.
2587
    // Get to the last one, which we will wrap over.
2588
78
    if (dup_tensor_symbol_info[i].assign_ref)
2589
17
    {
2590
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2591
17
      dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2592
17
      assert(dup_tensor_symbol_info[i].assign_ref);
2593
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2594
17
    }
2595
13
}
2596
2597
// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2598
// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2599
// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2600
static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2601
21
{
2602
21
  int i, j, k;
2603
45
  for (i = 0; i < p_node_info->output_size; i++)
2604
24
  {
2605
24
    const int d = p_node_info->outputs[i];
2606
24
    const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2607
24
    if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2608
6
      continue;
2609
36
    for (k = 0; k < destination_size; k++)
2610
18
      _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2611
    // Add the duplicated destinations to the tensor_block_ref.
2612
42
    for (j = 0; j < unroll_count; j++)
2613
48
      for (k = 0; k < destination_size; k++)
2614
24
      {
2615
24
        const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2616
24
        const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2617
24
        if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2618
24
          _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2619
24
      }
2620
18
  }
2621
21
}
2622
2623
static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2624
21
{
2625
21
  int i, j;
2626
21
  ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2627
21
  ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2628
  // blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2629
  // Unfortunately, I cannot apply this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2630
  // No need to change anything, we are good.
2631
21
  const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2632
21
  if (!unroll_count)
2633
8
    return;
2634
  // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2635
  // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2636
13
  ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2637
13
  int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2638
13
  int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2639
13
  _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2640
13
  ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2641
13
  ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2642
26
  ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0);
2643
13
  ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2644
26
  _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2645
  // Free out the old exec_dep
2646
26
  ccv_matrix_free(exec_dep);
2647
  // and the tensor blocks, prepare for the new.
2648
26
  _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2649
  // A reverse map to find where the original tensor comes from.
2650
26
  int* dup_tensor_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2651
142
  for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2652
129
    dup_tensor_from_ref[i] = -1;
2653
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2654
193
    for (j = 0; j < unroll_count; j++)
2655
115
      if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2656
104
        dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2657
26
  int* dup_exec_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2658
90
  for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2659
77
    dup_exec_from_ref[i] = -1;
2660
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2661
35
  {
2662
35
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2663
2
      continue;
2664
33
    dup_exec_from_ref[i] = i; // Reference back.
2665
77
    for (j = 0; j < unroll_count; j++)
2666
44
      if (dup_exec_ref[i * unroll_count + j] >= 0)
2667
44
        dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2668
33
  }
2669
  // Reset all attr.
2670
26
  memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2671
26
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2672
26
  ccv_nnc_graph_visit_free(dup_visit);
2673
26
  ccfree(dup_exec_symbol_info);
2674
26
  ccfree(dup_exec_from_ref);
2675
26
  ccfree(dup_tensor_from_ref);
2676
  // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2677
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2678
    // Loop over all possible duplications to assign dup_p_ref properly.
2679
193
    for (j = 0; j < unroll_count; j++)
2680
115
    {
2681
115
      const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2682
115
      if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2683
44
      {
2684
44
        const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2685
44
        const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2686
44
        if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2687
28
        {
2688
28
          if (!tensor_blocks[dup_idx].dup_p_refs)
2689
22
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2690
28
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2691
28
        }
2692
44
        if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2693
44
          continue;
2694
0
        const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2695
0
        const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2696
0
        if (p_ref_1_is_in_or_out == 1)
2697
0
        {
2698
0
          if (!tensor_blocks[dup_idx].dup_p_refs)
2699
0
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2700
0
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2701
0
        }
2702
0
      }
2703
115
    }
2704
  // companion_ref
2705
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2706
    // Now can assign them (The dup) as companion.
2707
78
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && dup_tensor_symbol_info[i].assign_ref)
2708
17
    {
2709
      // Get to the last one, which we will wrap over.
2710
17
      const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2711
17
      if (assign_ref >= 0)
2712
17
      {
2713
17
        int b_ref = assign_ref;
2714
17
        while (tensor_blocks[b_ref].ref)
2715
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2716
17
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2717
17
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2718
        // It cannot be that both i can hop to j can j can hop to i.
2719
        // And it can be hop from one to another now after duplication.
2720
17
        assert(a_hop_b > 0 || b_hop_a > 0);
2721
17
        tensor_blocks[i].companion_ref = b_ref + 1;
2722
17
        tensor_blocks[b_ref].companion_ref = i + 1;
2723
17
      }
2724
17
    }
2725
13
  ccfree(dup_tensor_symbol_info);
2726
  // Extend the dup tensor block ref, prepare for future extensions.
2727
13
  dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2728
110
  for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2729
97
    dup_tensor_block_ref[i] = -1;
2730
  // Assign out changed properties.
2731
13
  *r_exec_dep = exec_dep;
2732
13
  *r_tensor_blocks = tensor_blocks;
2733
13
  *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2734
13
  *r_dup_graph = dup_graph;
2735
13
  *r_unroll_count = unroll_count;
2736
13
  *r_dup_exec_ref = dup_exec_ref;
2737
13
  *r_dup_tensor_block_ref = dup_tensor_block_ref;
2738
13
}
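
// Standalone illustrative sketch (hypothetical helper, not part of the original source) of the
// 1-based reference convention used by the block fields above (ref, companion_ref, assign_ref,
// p_refs): a stored value of 0 means "no reference", and a stored value of k points at index k - 1.
static inline int toy_deref(const int one_based_ref)
{
  // Returns the referenced index, or -1 when the field is unset.
  return one_based_ref ? one_based_ref - 1 : -1;
}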
2739
2740
static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2741
31
{
2742
31
  if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2743
28
    return tensor_block_size;
2744
3
  int i;
2745
3
  const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2746
3
  int found_idx = tensor_block_size;
2747
9
  for (i = 0; i < anonymous_block_free_list_cap; i++)
2748
7
  {
2749
7
    const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2750
7
    assert(idx < tensor_block_size);
2751
    // If the type doesn't match, ignore.
2752
7
    if (tensor_blocks[idx].type != type)
2753
0
      continue;
2754
    // Heuristic about how to select the best tensor block to move forward.
2755
    // If the block is large enough and no dup_p_refs were requested, I cannot do better than this, just return directly.
2756
7
    if (tensor_blocks[idx].size >= size)
2757
1
    {
2758
1
      if (no_dup_p_refs)
2759
1
        return idx;
2760
      // Otherwise, if the current tensor block's dup_p_refs is after (or at) the requested dup_p_refs,
2761
      // we cannot do better than this; in that case, just return.
2762
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2763
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2764
0
        return idx;
2765
0
    }
2766
6
    int64_t found_idx_size_diff;
2767
6
    int64_t idx_size_diff;
2768
6
    if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2769
      // Now, compare whether this one or the found_idx one is better.
2770
      // At this point, there is no point in comparing the dup_p_refs, we only care about which one
2771
      // is closer to the size we request. Only on a tie does dup_p_refs matter again.
2772
6
      (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
2773
3
    {
2774
3
      found_idx = idx;
2775
3
      continue;
2776
3
    }
2777
    // No need to update if found_idx is better than idx.
2778
3
    if (found_idx_size_diff > idx_size_diff)
2779
0
      continue;
2780
    // We bias towards the bigger one in case of similar.
2781
3
    if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
2782
0
    {
2783
0
      found_idx = idx;
2784
0
      continue;
2785
0
    }
2786
3
    assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
2787
    // On a tie, check which one has tighter life-cycle.
2788
3
    if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
2789
0
    {
2790
      // Check whether the current tensor block's life-cycle is longer than the previous one.
2791
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
2792
0
        (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
2793
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2794
0
        found_idx = idx;
2795
0
      continue;
2796
0
    }
2797
    // Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
2799
    // We prefer to choose the one that has a life-cycle closer to the expected one.
2799
3
    if (no_dup_p_refs)
2800
3
    {
2801
      // Whoever is shorter wins.
2802
3
      if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
2803
3
        (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
2804
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
2805
0
        found_idx = idx;
2806
3
      continue;
2807
3
    }
2808
0
    if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
2809
0
      continue;
2810
0
    if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
2811
0
    {
2812
0
      found_idx = idx;
2813
0
      continue;
2814
0
    }
2815
    // If both cover the requested dup_p_refs, we prefer the shorter one; otherwise we prefer the longer one.
2816
0
    const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
2817
0
    const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
2818
0
    if (idx_after_request && found_idx_after_request)
2819
0
    {
2820
0
      if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
2821
0
        found_idx = idx;
2822
0
      continue;
2823
0
    } else {
2824
      // If we entered this branch, either idx_after_request is false or found_idx_after_request is false, or both.
2825
      // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
2826
      // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
2827
0
      if (!found_idx_after_request && (idx_after_request ||
2828
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2829
0
        found_idx = idx;
2830
0
      continue;
2831
0
    }
2832
0
  }
2833
2
  return found_idx;
2834
3
}
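
// A minimal, self-contained sketch of the selection heuristic implemented above, under simplifying
// assumptions: the toy_block_t below stands in for ccv_nnc_tensor_block_t, only type and size are
// modelled, and the dup_p_refs life-cycle tie-breakers are ignored. Scan the free list, skip type
// mismatches, return immediately when a block is big enough, otherwise keep the candidate whose
// size is closest to the request (biasing towards the bigger one on a tie).
#include <stdint.h>
#include <stdlib.h>

typedef struct {
  int type;
  uint64_t size;
} toy_block_t;

static int toy_block_from_free_list(const toy_block_t* const blocks, const int block_size, const int* const free_list, const int free_list_cap, const int type, const uint64_t size)
{
  int i, found_idx = block_size; // block_size doubles as the "nothing found" sentinel, as in the real code.
  for (i = 0; i < free_list_cap; i++)
  {
    const int idx = free_list[i];
    if (blocks[idx].type != type) // The type must match exactly.
      continue;
    if (blocks[idx].size >= size) // Big enough already, cannot do better than this.
      return idx;
    if (found_idx == block_size)
    {
      found_idx = idx;
      continue;
    }
    const int64_t found_diff = llabs((int64_t)blocks[found_idx].size - (int64_t)size);
    const int64_t idx_diff = llabs((int64_t)blocks[idx].size - (int64_t)size);
    // Prefer the one closer to the requested size; on a tie, bias towards the bigger block.
    if (idx_diff < found_diff || (idx_diff == found_diff && blocks[idx].size > blocks[found_idx].size))
      found_idx = idx;
  }
  return found_idx;
}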
2835
2836
static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
2837
49
{
2838
49
  if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
2839
28
    return 0;
2840
21
  int i, j, k;
2841
21
  int input_size = 0;
2842
43
  for (i = 0; i < p_node_info->p_while.input_size; i++)
2843
22
    if (p_node_info->p_while.inputs[i] >= 0)
2844
2
      ++input_size;
2845
  // If it doesn't have tensor inputs (thus, only special inputs), just return.
2846
21
  if (!input_size)
2847
19
    return 0;
2848
2
  ccv_nnc_tensor_symbol_t inputs[input_size];
2849
2
  input_size = 0;
2850
6
  for (i = 0; i < p_node_info->p_while.input_size; i++)
2851
4
    if (p_node_info->p_while.inputs[i] >= 0)
2852
2
      inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
2853
2
        .d = p_node_info->p_while.inputs[i],
2854
2
        .graph = symbolic_graph,
2855
2
      };
2856
2
  assert(symbolic_graph->breakpoint_size > 0);
2857
2
  ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
2858
2
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
2859
4
  for (i = 0; i < symbolic_graph->breakpoint_size; i++)
2860
2
  {
2861
    // Make a noop copy of the breakpoint, but with some tensor inputs.
2862
2
    ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
2863
2
    ccv_array_push(dup_breakpoints, &noop);
2864
    // Connect this noop to the outgoing nodes of breakpoints.
2865
2
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
2866
2
    if (symbol_info->outgoings)
2867
4
      for (j = 0; j < symbol_info->outgoings->rnum; j++)
2868
2
      {
2869
2
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
2870
2
        ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
2871
2
          .d = d,
2872
2
          .graph = symbolic_graph,
2873
2
        });
2874
2
      }
2875
2
  }
2876
7
  for (i = 0; i < exec_symbol_info_size; 
i++5
)
2877
5
  {
2878
5
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
2879
5
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
2880
0
      continue;
2881
5
    if (symbol_info->outgoings)
2882
3
    {
2883
3
      const int outgoing_size = symbol_info->outgoings->rnum;
2884
6
      for (j = 0; j < outgoing_size; j++)
2885
3
      {
2886
3
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
2887
6
        for (k = 0; k < symbolic_graph->breakpoint_size; k++)
2888
3
          if (d == symbolic_graph->breakpoints[k].d)
2889
0
          {
2890
0
            ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
2891
0
            ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
2892
0
              .d = i,
2893
0
              .graph = symbolic_graph,
2894
0
            }, noop);
2895
            // Found, connected, exit.
2896
0
            break;
2897
0
          }
2898
3
      }
2899
3
    }
2900
5
  }
2901
  // Add the dup_breakpoints to sources if necessary.
2902
2
  assert(symbolic_graph->sources);
2903
2
  const int source_size = symbolic_graph->sources->rnum;
2904
4
  for (i = 0; i < source_size; i++)
2905
2
  {
2906
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
2907
2
    for (j = 0; j < symbolic_graph->breakpoint_size; j++)
2908
2
      if (d == symbolic_graph->breakpoints[j].d)
2909
2
      {
2910
2
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
2911
2
        ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
2912
        // Found, made, exit.
2913
2
        break;
2914
2
      }
2915
2
  }
2916
  // Add the dup_breakpoints to destinations if necessary.
2917
2
  assert(symbolic_graph->destinations);
2918
2
  const int destination_size = symbolic_graph->destinations->rnum;
2919
4
  for (i = 0; i < destination_size; i++)
2920
2
  {
2921
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
2922
4
    for (j = 0; j < symbolic_graph->breakpoint_size; j++)
2923
2
      if (d == symbolic_graph->breakpoints[j].d)
2924
0
      {
2925
0
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
2926
0
        ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
2927
        // Found, made, exit.
2928
0
        break;
2929
0
      }
2930
2
  }
2931
2
  return dup_breakpoints;
2932
2
}
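
// A small sketch (toy types and names, not the ccv_nnc API) of the rewiring idea used above: for
// each breakpoint, a stand-in "noop" node is appended and given the same outgoing edges, so the
// noop can later replace the breakpoint wherever the breakpoint acted as a source.
#include <string.h>

#define TOY_MAX_OUTGOING 8

typedef struct {
  int outgoings[TOY_MAX_OUTGOING];
  int outgoing_size;
} toy_exec_t;

// Appends a node mirroring the breakpoint's outgoing edges and returns its index.
static int toy_dup_breakpoint(toy_exec_t* const execs, int* const exec_size, const int breakpoint)
{
  const int noop = (*exec_size)++;
  memcpy(execs[noop].outgoings, execs[breakpoint].outgoings, sizeof(execs[breakpoint].outgoings));
  execs[noop].outgoing_size = execs[breakpoint].outgoing_size;
  return noop;
}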
2933
2934
// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
2935
static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
2936
6.10k
{
2937
6.10k
  assert(source_size > 0);
2938
6.10k
  assert(destination_size > 0);
2939
  // First, fill all the "auto" holes.
2940
  // This is the symbol table that with "auto" info filled up.
2941
6.10k
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
2942
6.10k
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
2943
6.10k
  ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
2944
12.2k
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new6.10k
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
2945
0
  ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
2946
12.2k
  int i, j, k, p, q;
2947
12.2k
  const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
2948
12.2k
  ccv_sparse_matrix_t* exec_dep;
2949
12.2k
  ccv_nnc_tensor_block_t* tensor_blocks;
2950
12.2k
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
2951
12.2k
  int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2952
  // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
2953
  // are automatically filled in, and all the sub-graphs are processed.
2954
  // There is a last step though, for a while loop, it is parameterized:
2955
  // while (x > 5) {
2956
  //     y = x + 1;
2957
  // } (y => x) // This means after this loop is done, y's value will be copied over to x.
2958
  // we will do our best to avoid the actual data copy; what we do here is to check whether y can be x's alias.
2959
  // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
2960
  // it is an in-place operation.
2961
  // But if y cannot be x's alias, for example, this while loop looks like this:
2962
  // while (x > 5) {
2963
  //     y = x + a
2964
  //     b = x + y
2965
  // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
2966
  // For this example, y cannot be x's alias because x is used later to compute b (and that computation
2967
  // has dependency on y as well).
2968
  // For this case, we need to modify the computation graph. Previously, the graph looks like this:
2969
  // y = x + a -> b = x + y
2970
  // This graph will be extended to look like this:
2971
  // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
2972
  // while (x0 > 5) {
2973
  //     y0 = x0 + a0
2974
  //     b0 = x0 + y0
2975
  //     if (y0 > 5) break
2976
  //     y1 = y0 + b0
2977
  //     b1 = y0 + y1
2978
  // } (y1 => x0, b1 => a0)
2979
  // After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
2980
  // with each other now).
2981
  // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
2982
  // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
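  // Illustrative addition (not in the original source): the same switch-pointer idea written out in
  // plain C with hypothetical ping-pong buffers, where add(dst, s0, s1, n) stands for an element-wise
  // addition; the real implementation expresses this with tensor_multiview_t instead of raw pointers.
  //
  //   float buf[4][N];
  //   float *x = buf[0], *a = buf[1]; // the loop-carried values
  //   float *y = buf[2], *b = buf[3]; // where this iteration's results are written
  //   while (x[0] > 5) {
  //     add(y, x, a, N); // y = x + a
  //     add(b, x, y, N); // b = x + y
  //     float* t;
  //     t = x; x = y; y = t; // y => x: swap pointers instead of copying data
  //     t = a; a = b; b = t; // b => a
  //   }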
2983
12.2k
  ccv_nnc_symbolic_graph_t* dup_graph = 0;
2984
12.2k
  int* dup_exec_ref = 0;
2985
12.2k
  int* dup_tensor_block_ref = 0;
2986
12.2k
  int unroll_count = 0;
2987
  // In true recursive fashion, I need to call all the sub-graphs and do the pre-compilation for them one by one.
2988
12.2k
  ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)
ccmalloc6.10k
(sizeof(ccv_nnc_symbolic_graph_prep_t));
2989
12.2k
  prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
2990
12.2k
  prep->flags = 0;
2991
  // Cannot handle duplicating a node that is a graph as well.
2992
12.2k
  if (
p_exec_symbol_info6.10k
)
2993
49
  {
2994
49
    prep->flags = p_node_info->flags;
2995
49
    if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
2996
21
    {
2997
21
      _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
2998
21
      _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
2999
28
    } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3000
      // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3001
28
    }
3002
49
  }
3003
12.2k
  ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3004
12.2k
  ccv_array_t* anonymous_block_free_list = 0;
3005
12.2k
  const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3006
  // Record whether this tensor is folded in this round.
3007
12.2k
  uint32_t* const tensor_fold = (uint32_t*)
ccmalloc6.10k
(sizeof(uint32_t) * tensor_fold_size);
3008
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3009
31.8k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3010
49
    {
3011
49
      assert(symbolic_graph->sub_graphs);
3012
49
      ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3013
49
      ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3014
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3015
49
      sub_prep->dup_breakpoints = dup_breakpoints;
3016
49
      sub_prep->p = prep;
3017
49
      sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3018
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3019
49
      const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3020
296
      for (i = 0; i < s_alloc_prep->block_size; 
i++247
)
3021
247
      {
3022
247
        const int block_ref = s_alloc_prep->blocks[i].block_ref;
3023
247
        const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3024
247
        if (block_ref < sub_prep->tensor_symbol_info_size)
3025
192
        {
3026
          // If this block has a bypass, and its bypass has different p_refs, then it doesn't matter.
3027
          // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3028
192
          if (s_tensor_blocks[block_ref].bypass_ref)
3029
1
          {
3030
1
            int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3031
1
            while (s_tensor_blocks[bypass_ref].ref)
3032
0
              bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3033
1
            if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3034
1
              
s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1]0
)
3035
1
              continue;
3036
1
          }
3037
191
          if (s_tensor_blocks[block_ref].p_refs[0])
3038
91
          {
3039
            /* If it is already properly assigned, next. */
3040
91
            if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3041
91
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3042
91
            {
3043
91
              if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3044
90
                s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3045
1
              else {
3046
1
                assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3047
1
                s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3048
1
              }
3049
91
            }
3050
            /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3051
91
            if (s_tensor_blocks[block_ref].p_refs[1] &&
3052
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1]3
&&
3053
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1]3
)
3054
3
            {
3055
3
              assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3056
3
              assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3057
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3058
3
            }
3059
91
          }
3060
191
        } else 
if (55
s_tensor_blocks[block_ref].dup_p_refs55
) {
3061
          /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of an anonymous block
3062
           * which by default only has a life-cycle shared with this sub-graph node. The reason to extend it is that
3063
           * anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3064
           * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3065
           * its life-time to the end of the output tensor. */
3066
15
          if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3067
13
            s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3068
33
          for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; 
j++18
)
3069
18
            ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3070
15
        }
3071
247
      }
3072
49
    }
3073
31.8k
    const int init_tensor_block_size = tensor_block_size;
3074
31.8k
    int rw_anonymous_buffer_size_cap = 0;
3075
31.8k
    int ro_anonymous_buffer_size_cap = 0;
3076
31.8k
    if (anonymous_block_free_list)
3077
17
      ccv_array_clear(anonymous_block_free_list);
3078
31.8k
    memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3079
31.8k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3080
49
    {
3081
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3082
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3083
49
      int rw_anonymous_buffer_size = 0;
3084
49
      int ro_anonymous_buffer_size = 0;
3085
229
      for (i = 0; i < s_alloc_prep->buffer_size; 
i++180
)
3086
180
        if (s_alloc_prep->buffers[i].p_refs[0])
3087
90
        {
3088
          /* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */
3089
90
          int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3090
          /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3091
90
          int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3092
90
          assert(p_ref_0_is_in_or_out != 0);
3093
90
          int unref_p_ref_0 = p_ref_0;
3094
92
          while (tensor_blocks[unref_p_ref_0].ref)
3095
2
            unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3096
          /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3097
90
          assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3098
90
          if (s_alloc_prep->buffers[i].p_refs[1])
3099
4
          {
3100
4
            int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3101
4
            const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3102
4
            assert(p_ref_1_is_in_or_out != 0);
3103
4
            int unref_p_ref_1 = p_ref_1;
3104
4
            while (tensor_blocks[unref_p_ref_1].ref)
3105
0
              unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3106
            /* See above comment for the similar p_ref_0 check. */
3107
4
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3108
4
            assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3109
4
            int p_ref_t;
3110
4
            if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3111
3
            {
3112
3
              CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3113
3
              CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3114
3
            }
3115
4
            p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3116
            /* If the dimension matches, can fold. */
3117
4
            if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3118
4
            {
3119
4
              const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3120
4
              if (folded)
3121
1
              {
3122
1
                p_ref_0 = p_ref_1;
3123
1
                unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3124
1
                tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3125
1
                for (j = 0; j < unroll_count; 
j++0
) /* Fold its duplicates as well. */
3126
0
                {
3127
0
                  const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3128
0
                  assert(folded && "the subsequent duplicates can be folded too.");
3129
0
                }
3130
1
              }
3131
4
            }
3132
4
          }
3133
          /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3134
           * Or if the p_ref_0 is the output, it first starts from this node (thus, I have full control over
3135
           * its life-cycle). Or if the p_ref_0 is the input, it ends in this node (thus, I can take over its
3136
           * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3137
           * within its memory region)). Unless this buffer is used as read-only and we don't have any output
3138
           * associated with it, then we are good. */
3139
90
          if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3140
90
            
(89
p_ref_0_is_in_or_out == 189
&&
_ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)50
) ||
3141
90
            
(39
p_ref_0_is_in_or_out == -139
&&
_ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)39
) ||
3142
90
            
TENSOR_READ_WRITE8
(s_alloc_prep->buffers[i]) == READ_ONLY8
)
3143
86
          {
3144
86
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3145
27
              { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3146
            /* p_ref_0 is either the only one or the output tensor; we always prefer the output tensor (there
3147
             * is a long argument why that is the case, the gist is, it is much easier to control your output
3148
             * than your input). */
3149
86
            s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3150
86
            s_alloc_prep->buffers[i].p_refs[1] = 0;
3151
            /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3152
86
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3153
86
            tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3154
95
            for (j = 0; j < unroll_count; 
j++9
) /* Change the size of its duplicates as well. */
3155
9
              tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3156
9
                tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3157
9
                  tensor_blocks[unref_p_ref_0].size;
3158
86
          } else {
3159
4
            s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3160
4
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3161
0
              ++ro_anonymous_buffer_size;
3162
4
            else
3163
4
              rw_anonymous_buffer_size += unroll_count + 1;
3164
4
          }
3165
90
        } else {
3166
90
          if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3167
63
            ++ro_anonymous_buffer_size;
3168
27
          else
3169
27
            rw_anonymous_buffer_size += unroll_count + 1;
3170
90
        }
3171
49
      if (ro_anonymous_buffer_size || 
rw_anonymous_buffer_size24
)
3172
28
      {
3173
28
        const int anonymous_block_free_list_cap = anonymous_block_free_list ? 
anonymous_block_free_list->rnum6
:
022
;
3174
        // All read-write buffers can (potentially) be reused between case..of branches.
3175
28
        rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3176
        // Read-only buffers cannot be reused between case..of branches.
3177
28
        ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3178
        /* Anonymous block, allocate additional tensor blocks for this. */
3179
        /* This is either because this is an internal tensor (don't have p_ref) */
3180
        /* or it is an anonymous block itself within the sub graphs of this while graph. */
3181
28
        tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3182
28
        memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3183
28
        if (dup_tensor_block_ref)
3184
3
          dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3185
174
        for (i = 0; i < s_alloc_prep->buffer_size; 
i++146
)
3186
146
          if (!s_alloc_prep->buffers[i].p_refs[0])
3187
94
          {
3188
94
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3189
63
            {
3190
63
              assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3191
63
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3192
63
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3193
63
              tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3194
63
              tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3195
63
              tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3196
63
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3197
63
              tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3198
63
              ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3199
63
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3200
63
              if (dup_p_refs && dup_p_refs->rnum > 0)
3201
0
              {
3202
0
                for (j = 0; j < dup_p_refs->rnum; j++)
3203
0
                {
3204
0
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3205
0
                  assert(dup_p_ref >= 0);
3206
0
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3207
0
                  assert(tensor_blocks[dup_p_ref].tail);
3208
                  // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3209
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3210
0
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3211
0
                  {
3212
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3213
0
                    assert(p_node_info);
3214
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3215
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3216
0
                    {
3217
0
                      if (!tensor_blocks[tensor_block_size].dup_p_refs)
3218
0
                        tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3219
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3220
0
                    }
3221
0
                  }
3222
0
                  if (!tensor_blocks[tensor_block_size].tail)
3223
0
                    tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3224
0
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3225
0
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3226
0
                }
3227
63
              } else {
3228
63
                tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3229
63
                ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3230
63
              }
3231
132
              
for (j = 0; 63
j < source_size;
j++69
)
3232
69
                _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3233
              /* If this is read-only (based on SSA, if first encountered as a read), and this is a
3234
               * sub-graph, mark it to the end of the graph. */
3235
63
              if (p_exec_symbol_info)
3236
12
                
for (j = 0; 6
j < destination_size;
j++6
)
3237
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3238
              /* If it is read-only, it is self-reflecting. */
3239
69
              for (k = 0; k < unroll_count; 
k++6
)
3240
6
              {
3241
12
                for (j = 0; j < destination_size; 
j++6
)
3242
6
                  if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3243
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3244
                /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3245
6
                assert(symbolic_graph->p);
3246
6
                dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3247
6
              }
3248
63
              ++tensor_block_size;
3249
63
            } else {
3250
31
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3251
31
              const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3252
31
              const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3253
              // Find suitable tensor block from the free list.
3254
31
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3255
31
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3256
31
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3257
31
              if (new_anonymous_tensor_block)
3258
28
              {
3259
28
                tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3260
28
                tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3261
28
                tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3262
28
                tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3263
28
                ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3264
28
              } else {
3265
3
                tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3266
3
                tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3267
3
              }
3268
31
              if (dup_p_refs && dup_p_refs->rnum > 0)
3269
4
              {
3270
8
                for (j = 0; j < dup_p_refs->rnum; 
j++4
)
3271
4
                {
3272
4
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3273
4
                  assert(dup_p_ref >= 0);
3274
4
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3275
                  // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3276
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3277
4
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3278
0
                  {
3279
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3280
0
                    assert(p_node_info);
3281
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3282
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3283
0
                    {
3284
0
                      if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3285
0
                        tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3286
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3287
0
                    }
3288
0
                  }
3289
4
                  assert(tensor_blocks[dup_p_ref].tail);
3290
4
                  if (!tensor_blocks[tensor_block_idx].tail)
3291
4
                    tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3292
8
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; 
k++4
)
3293
4
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3294
                  // We have to add it to the wrap-around companion_ref as well.
3295
                  // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3296
                  // be occupied and unlikely to be reused), we cannot really do too much about it because the companion_ref's
3297
                  // definition is too free-form, and if we enforce a stronger guarantee on this (such as it must wrap around), this
3298
                  // guarantee may break down the line.
3299
4
                  if (tensor_blocks[dup_p_ref].companion_ref)
3300
0
                  {
3301
0
                    const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3302
0
                    for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3303
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3304
0
                    for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3305
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3306
0
                  }
3307
4
                }
3308
27
              } else if (new_anonymous_tensor_block) {
3309
24
                tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3310
24
                ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3311
24
              }
3312
31
              const int prev_tensor_block_idx = tensor_block_idx;
3313
31
              if (new_anonymous_tensor_block)
3314
28
              {
3315
28
                if (!anonymous_block_free_list)
3316
16
                  anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3317
28
                ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3318
28
                ++tensor_block_size;
3319
28
              }
3320
32
              for (k = 0; k < unroll_count; 
k++1
)
3321
1
              {
3322
1
                const int tensor_block_idx = new_anonymous_tensor_block ?
3323
1
                  (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3324
1
                  
dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k]0
;
3325
1
                TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3326
1
                TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3327
1
                if (new_anonymous_tensor_block)
3328
1
                {
3329
1
                  tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3330
1
                  tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3331
1
                  tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3332
1
                  tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3333
                  /* Attach to duplicated exec for this tensor block. */
3334
1
                  ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3335
1
                } else {
3336
0
                  tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3337
0
                  tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3338
0
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3339
3340
0
                }
3341
1
                if (dup_p_refs && dup_p_refs->rnum > 0)
3342
1
                {
3343
                  /* Not nil, not self-reflecting. */
3344
2
                  for (j = 0; j < dup_p_refs->rnum; 
j++1
)
3345
1
                  {
3346
1
                    const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3347
1
                    assert(dup_p_ref >= 0);
3348
1
                    assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3349
                    // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3350
                    // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3351
1
                    if (tensor_symbol_info[dup_p_ref].p_ref)
3352
0
                    {
3353
0
                      const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3354
0
                      assert(p_node_info);
3355
0
                      const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3356
0
                      if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3357
0
                      {
3358
0
                        if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3359
0
                          tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3360
0
                        ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3361
0
                      }
3362
0
                    }
3363
1
                    assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3364
1
                    const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3365
1
                    assert(tensor_blocks[dup_dup_p_ref].tail);
3366
1
                    if (!tensor_blocks[tensor_block_idx].tail)
3367
1
                      tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3368
2
                    for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; 
q++1
)
3369
1
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3370
                    // We have to add it to the wrap-around companion_ref as well.
3371
1
                    if (tensor_blocks[dup_dup_p_ref].companion_ref)
3372
0
                    {
3373
0
                      const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3374
0
                      for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3375
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3376
0
                      for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3377
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3378
0
                    }
3379
1
                  }
3380
1
                } else 
if (0
new_anonymous_tensor_block0
) {
3381
0
                  tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3382
0
                  ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3383
0
                }
3384
1
                if (new_anonymous_tensor_block)
3385
1
                  ++tensor_block_size;
3386
1
              }
3387
31
            }
3388
94
          }
3389
28
      }
3390
49
    }
3391
31.8k
  } ccv_nnc_graph_visit_endfor
3392
6.10k
  if (anonymous_block_free_list)
3393
16
    ccv_array_free(anonymous_block_free_list);
3394
6.10k
  ccfree(tensor_fold);
3395
  // It is time to guess what the best tensor placement is and create the opaque tensor arena. The alloc_dep will return
3396
  // the allocation dependencies, i.e., which tensor reuses which existing tensor.
3397
6.10k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new(exec_dep, tensor_blocks, tensor_block_size);
3398
6.10k
  ccv_matrix_free(exec_dep);
3399
6.10k
  prep->while_count_tensor = 0;
3400
6.10k
  prep->dup_breakpoints = 0;
3401
6.10k
  prep->p = 0;
3402
6.10k
  prep->symbolic_graph = symbolic_graph;
3403
6.10k
  prep->p_idx = symbolic_graph->p_idx;
3404
6.10k
  prep->exec_idx = symbolic_graph->exec_idx;
3405
6.10k
  prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3406
6.10k
  prep->sub_preps = sub_preps;
3407
6.10k
  prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3408
6.10k
  prep->exec_symbol_info = exec_symbol_info;
3409
6.10k
  prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3410
6.10k
  prep->tensor_symbol_info = tensor_symbol_info;
3411
6.10k
  prep->unroll_count = unroll_count;
3412
6.10k
  prep->dup_tensor_block_ref = dup_tensor_block_ref;
3413
6.10k
  prep->tensor_block_size = tensor_block_size;
3414
6.10k
  prep->tensor_blocks = tensor_blocks;
3415
6.10k
  prep->exec_flags = exec_flags;
3416
6.10k
  prep->visit = visit;
3417
6.10k
  prep->alloc_prep = alloc_prep;
3418
6.10k
  if (dup_graph)
3419
13
    ccv_nnc_symbolic_graph_free(dup_graph);
3420
6.10k
  if (dup_exec_ref)
3421
13
    ccfree(dup_exec_ref);
3422
6.10k
  return prep;
3423
12.2k
}
3424
3425
static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3426
6.10k
{
3427
6.10k
  int i;
3428
6.10k
  _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3429
6.10k
  ccfree(prep->exec_flags);
3430
6.15k
  for (i = 0; i < prep->sub_prep_size; 
i++50
)
3431
50
    if (prep->sub_preps[i])
3432
49
      _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3433
6.10k
  if (prep->sub_preps)
3434
29
    ccfree(prep->sub_preps);
3435
6.10k
  ccfree(prep->tensor_symbol_info);
3436
6.10k
  ccfree(prep->exec_symbol_info);
3437
6.10k
  if (prep->dup_tensor_block_ref)
3438
13
    ccfree(prep->dup_tensor_block_ref);
3439
6.10k
  _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3440
6.10k
  ccv_nnc_graph_visit_free(prep->visit);
3441
6.10k
  ccfree(prep);
3442
6.10k
}
3443
3444
static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3445
6.10k
{
3446
6.10k
  int i, j;
3447
31.8k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3448
31.8k
    if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3449
21
    {
3450
21
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3451
21
      assert(graph_ref >= 0);
3452
21
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3453
43
      for (i = 0; i < node->p_while.input_size; 
i++22
)
3454
22
        if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i]))
3455
20
        {
3456
20
          ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3457
20
          const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i]);
3458
21
          for (j = 0; j < d; 
j++1
)
3459
1
            prep = prep->p;
3460
20
          prep->while_count_tensor = 1;
3461
20
        }
3462
21
    }
3463
31.8k
    for (i = 0; i < node->graph_ref_size; i++)
3464
49
    {
3465
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3466
49
      if (graph_ref >= 0)
3467
49
        _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3468
49
    }
3469
31.8k
  } ccv_nnc_graph_visit_endfor
3470
6.10k
}
3471
3472
static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3473
89.9k
{
3474
89.9k
  if (symbol >= 0)
3475
65.6k
    return graph_prep->tensor_arena->vt_tensors[symbol];
3476
24.2k
  if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3477
24.2k
    return 0;
3478
20
  assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol));
3479
20
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3480
20
  int i;
3481
20
  const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol);
3482
21
  for (i = 0; i < d; 
i++1
)
3483
1
    prep = prep->p;
3484
20
  assert(prep->while_count_tensor);
3485
20
  return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3486
20
}
3487
3488
static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3489
6.10k
{
3490
6.10k
  int i;
3491
6.10k
  int* const exec_cvt = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
3492
6.10k
  ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3493
6.10k
  graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3494
6.10k
  graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3495
6.10k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3496
58.2k
  for (i = 0; i < graph_exec_arena->graph_exec_size; 
i++52.0k
)
3497
52.0k
    if (graph_execs[i].graph == graph)
3498
31.8k
      graph_execs[i].d = exec_cvt[graph_execs[i].d];
3499
6.10k
  ccfree(exec_cvt);
3500
6.10k
}
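
// A standalone sketch (hypothetical names, not from the original source) of the remapping pattern
// used above: after the graph's nodes are reordered by the topological sort, every cached node
// index is translated through the conversion table so existing handles still point at the same
// logical node.
static void toy_remap_handles(int* const handles, const int handle_size, const int* const cvt)
{
  int i;
  for (i = 0; i < handle_size; i++)
    if (handles[i] >= 0) // Handles that do not belong to this graph are modelled here as -1.
      handles[i] = cvt[handles[i]];
}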
3501
3502
static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3503
6.10k
{
3504
6.10k
  int i, j, k;
3505
6.10k
  ccv_nnc_graph_t* const graph = graph_prep->graph;
3506
6.10k
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3507
6.10k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3508
6.10k
  graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3509
6.10k
  graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3510
6.10k
  graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3511
6.10k
  graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3512
6.10k
  memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3513
6.10k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3514
6.10k
  int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3515
58.2k
  for (i = 0; i < exec_symbol_info_size; 
i++52.0k
)
3516
52.0k
  {
3517
52.0k
    max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size);
3518
52.0k
    max_output_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].output_size);
3519
52.0k
    if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3520
22
      max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size);
3521
52.0k
    graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3522
52.0k
    graph_execs[i].graph = 0;
3523
52.0k
  }
3524
6.15k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
3525
50
    max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size);
3526
6.10k
  ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)];
3527
6.10k
  ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)];
3528
6.10k
  ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)];
3529
6.10k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3530
6.10k
  const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3531
  // Create node, this is in topological order.
3532
31.8k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3533
31.8k
    if (CCV_NO_GRAPH_EXEC(graph_execs[idx]))
3534
31.8k
    {
3535
121k
      for (i = 0; i < node->input_size; 
i++89.9k
)
3536
89.9k
        max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3537
81.6k
      for (i = 0; i < node->output_size; 
i++49.8k
)
3538
49.8k
        max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3539
31.8k
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3540
21
      {
3541
21
        const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3542
21
        assert(graph_ref >= 0);
3543
21
        ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3544
21
        ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3545
21
        graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3546
21
        const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3547
21
        ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3548
21
        ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3549
43
        for (i = 0; i < node->p_while.input_size; i++)
3550
22
          max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3551
42
        for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3552
21
          max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3553
21
        ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3554
21
        _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3555
31.7k
      } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3556
24
        for (i = 0; i < node->output_size; i++)
3557
13
          if (max_outputs[i] && max_outputs[i]->alias_ref)
3558
10
            max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3559
11
        graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3560
        // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3561
22
        for (i = 0; i < node->case_of.argument.offset; i++)
3562
11
        {
3563
11
          ccv_nnc_tensor_t* const update = max_inputs[i];
3564
11
          if (!CCV_IS_TENSOR_MULTIVIEW(update)) // No need if it is a naked tensor.
3565
9
            continue;
3566
2
          int flag = 0;
3567
2
          for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3568
0
            flag = (update == max_inputs[j]);
3569
2
          if (!flag)
3570
2
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3571
2
        }
3572
11
        const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3573
11
        ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3574
11
        if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3575
1
        {
3576
          // Add another graph for data transfer.
3577
1
          ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3578
2
          for (i = 0; i < node->output_size; i++)
3579
1
            max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3580
1
          ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size), max_outputs, ccv_min(node->input_size, node->output_size));
3581
1
          ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3582
1
          ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3583
1
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3584
1
          int exec_cvt;
3585
1
          ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3586
1
        }
3587
39
        for (i = 0; i < node->graph_ref_size; i++)
3588
28
        {
3589
28
          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3590
28
          if (graph_ref < 0)
3591
0
            continue;
3592
28
          ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3593
28
          const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3594
28
          ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3595
28
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3596
28
          _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3597
28
        }
3598
31.7k
      } else {
3599
31.7k
        graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3600
31.7k
      }
3601
31.8k
      ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3602
31.8k
    }
3603
31.8k
  } ccv_nnc_graph_visit_endfor
3604
  // Then connect them.
3605
31.8k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3606
31.8k
    if (node->outgoings)
3607
53.0k
      for (i = 0; i < node->outgoings->rnum; i++)
3608
27.9k
      {
3609
27.9k
        const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
3610
27.9k
        if (graph_execs[outgoing].graph)
3611
27.3k
          ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3612
27.9k
      }
3613
31.8k
  } ccv_nnc_graph_visit_endfor
3614
6.10k
  int source_exec_created = 0;
3615
6.10k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3616
6.10k
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3617
6.10k
  ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3618
  // After the graph is materialized, we need to handle the case where some of these tensors must be initialized to zero before use.
3619
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3620
94.0k
  {
3621
94.0k
    if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags))
3622
127
    {
3623
127
      int ref = i;
3624
127
      while (tensor_symbol_info[ref].alias_ref)
3625
0
        ref = tensor_symbol_info[ref].alias_ref - 1;
3626
127
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
3627
0
        ref = tensor_blocks[ref].ref - 1;
3628
      // This is not computable. It could be that we marked a const tensor as init zero.
3629
127
      if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]))
3630
46
        continue;
3631
      // If this tensor is not used by any exec, we don't need to init at all. Skip.
3632
81
      if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3633
0
        continue;
3634
81
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3635
      // Now that we have the original tensor, we can get the actual tensor and construct the set command.
3636
81
      ccv_nnc_graph_exec_t set_exec;
3637
81
      if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3638
27
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3639
54
      else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3640
54
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3641
162
      for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3642
81
      {
3643
81
        const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j);
3644
81
        if (outgoing >= exec_symbol_info_size)
3645
0
          continue;
3646
81
        assert(outgoing >= 0);
3647
81
        assert(graph_execs[outgoing].graph);
3648
81
        ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3649
81
      }
3650
81
      int flags = 0;
3651
81
      if (alloc_dep[ref])
3652
24
        for (j = 0; j < alloc_dep[ref]->rnum; j++)
3653
12
        {
3654
12
          const int d = *(int*)ccv_array_get(alloc_dep[ref], j);
3655
          // This is from alloc_dep, it should be computable.
3656
12
          assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
3657
12
          if (tensor_blocks[d].tail)
3658
24
            for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3659
12
            {
3660
12
              const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
3661
12
              if (incoming >= exec_symbol_info_size)
3662
0
                continue;
3663
12
              assert(incoming >= 0);
3664
12
              assert(graph_execs[incoming].graph);
3665
12
              ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3666
12
              flags = 1;
3667
12
            }
3668
12
        }
3669
      // If we cannot find a start node for this exec, we need to append it to the starting no-op.
3670
81
      if (!flags)
3671
69
      {
3672
69
        if (!source_exec_created)
3673
39
        {
3674
39
          graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3675
39
          source_exec_created = 1;
3676
39
        }
3677
69
        ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3678
69
      }
3679
81
    }
3680
94.0k
  }
3681
  // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3682
  // (we need that if it is not associated as an input / output of any exec; this is possible if all execs associate
3683
  // with its alias).
3684
6.10k
  assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size);
3685
100k
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3686
94.0k
  {
3687
94.0k
    ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3688
    // If it is a multiview tensor, inspect all its head nodes to see whether we already associated it with the node.
3689
94.0k
    if (mv && CCV_IS_TENSOR_MULTIVIEW(mv))
3690
53
    {
3691
53
      const ccv_array_t* const head = tensor_blocks[i].head;
3692
53
      if (head && head->rnum > 0)
3693
94
        for (j = 0; j < head->rnum; j++)
3694
47
        {
3695
47
          const int idx = *(int*)ccv_array_get(head, j);
3696
47
          if (idx >= exec_symbol_info_size)
3697
1
            continue;
3698
46
          assert(idx >= 0);
3699
46
          const int d = graph_execs[idx].d;
3700
46
          ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d);
3701
46
          int flag = 0;
3702
46
          if (exec_info->tensor_wraps_ref)
3703
32
          {
3704
32
            ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1);
3705
113
            for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3706
81
              flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3707
32
          }
3708
          // If none of the tensor wraps matched, it needs to be included in the cast.
3709
46
          if (!flag)
3710
19
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3711
46
        }
3712
53
    }
3713
94.0k
  }
3714
  // Create source / destination phony nodes. This is to facilitate use of the compiled graph.
3715
  // Also, this is needed if you have init zero execs.
3716
6.10k
  if (source_exec_created || source_size > 1)
3717
110
  {
3718
110
    if (!source_exec_created)
3719
71
      graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3720
495
    for (i = 0; i < source_size; i++)
3721
385
      ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3722
5.99k
  } else {
3723
5.99k
    assert(!source_exec_created);
3724
5.99k
    assert(source_size == 1);
3725
5.99k
    graph_exec_arena->source = graph_execs[sources[0].d];
3726
5.99k
  }
3727
6.10k
  if (destination_size == 1)
3728
6.02k
    graph_exec_arena->destination = graph_execs[destinations[0].d];
3729
81
  else {
3730
81
    graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3731
1.06k
    for (i = 0; i < destination_size; i++)
3732
982
      ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3733
81
  }
3734
6.10k
  ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3735
6.10k
  ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3736
6.10k
  return graph_exec_arena;
3737
6.10k
}
3738
3739
static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3740
11
{
3741
11
  if (graph_prep->symbolic_graph == pair)
3742
4
    return graph_prep->graph;
3743
7
  int i;
3744
10
  for (i = 0; i < graph_prep->sub_prep_size; i++)
3745
7
    if (graph_prep->sub_preps[i])
3746
7
    {
3747
7
      ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3748
7
      if (graph)
3749
4
        return graph;
3750
7
    }
3751
3
  return 0;
3752
7
}
3753
3754
static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3755
6.05k
{
3756
6.05k
  int i;
3757
6.10k
  for (i = 0; i < graph_prep->sub_prep_size; i++)
3758
43
    if (graph_prep->sub_preps[i])
3759
42
    {
3760
42
      if (graph_prep->sub_preps[i]->symbolic_graph->pair)
3761
4
        graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
3762
42
    }
3763
6.05k
}
3764
3765
static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3766
6.10k
{
3767
6.10k
  assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
3768
6.10k
  int i;
3769
58.2k
  for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
3770
52.0k
  {
3771
52.0k
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags))
3772
9
      continue;
3773
52.0k
    if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
3774
15.7k
    {
3775
15.7k
      ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
3776
15.7k
        .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
3777
15.7k
        .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
3778
15.7k
      });
3779
15.7k
      if (pair_exec.d >= 0)
3780
456
        ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
3781
15.7k
    }
3782
52.0k
  }
3783
6.15k
  for (i = 0; i < graph_prep->sub_prep_size; i++)
3784
50
    if (graph_prep->sub_preps[i])
3785
49
      _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
3786
6.10k
}
3787
3788
static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3789
6.10k
{
3790
6.10k
  int i;
3791
6.10k
  if (graph_prep->dup_breakpoints)
3792
2
  {
3793
    // Stripping the const modifier is only possible because this is a sub-graph.
3794
2
    ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
3795
4
    for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
3796
2
      ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i));
3797
2
    ccv_array_free(graph_prep->dup_breakpoints);
3798
2
    graph_prep->dup_breakpoints = 0;
3799
2
    graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3800
    // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
3801
2
    memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
3802
    // Since exec_symbol_info changed, create a new visit object.
3803
2
    assert(symbolic_graph->sources);
3804
2
    assert(symbolic_graph->destinations);
3805
2
    ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0);
3806
2
    const int source_size = symbolic_graph->sources->rnum;
3807
2
    ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0);
3808
2
    const int destination_size = symbolic_graph->destinations->rnum;
3809
4
    ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3810
0
    ccv_nnc_graph_visit_free(graph_prep->visit);
3811
4
    graph_prep->visit = visit;
3812
4
    assert(graph_prep->p);
3813
2
    ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
3814
2
  }
3815
31.8k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3816
31.8k
    for (i = 0; i < node->graph_ref_size; i++)
3817
49
    {
3818
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3819
49
      if (graph_ref >= 0)
3820
49
        _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
3821
49
    }
3822
31.8k
  } ccv_nnc_graph_visit_endfor
3823
6.10k
}
3824
3825
const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
3826
3827
void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
3828
6.05k
{
3829
6.05k
  assert(graph_ref);
3830
6.05k
  assert(tensor_arena_ref);
3831
6.05k
  assert(graph_exec_arena_ref);
3832
6.05k
  int i;
3833
  // Cannot bind the multi-view.
3834
53.6k
  for (i = 0; i < tensor_bind_size; i++)
3835
47.5k
  {
3836
47.5k
    assert(tensor_binds[i].tensor);
3837
47.5k
    assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor));
3838
47.5k
  }
3839
6.05k
  ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
3840
6.05k
  _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
3841
6.05k
  ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
3842
6.05k
  _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
3843
6.05k
  *tensor_arena_ref = tensor_arena;
3844
  // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
3845
6.05k
  _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
3846
  // Now that tensor allocation is done, if there are any dup_breakpoints, I need to clean them up.
3847
6.05k
  _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
3848
6.05k
  *graph_ref = graph_prep->graph;
3849
6.05k
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
3850
6.05k
  _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
3851
6.05k
  _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
3852
6.05k
  *graph_exec_arena_ref = graph_exec_arena;
3853
6.05k
  _ccv_nnc_symbolic_graph_prep_free(graph_prep);
3854
6.05k
}
3855
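For orientation, here is a minimal usage sketch of ccv_nnc_symbolic_graph_compile() as defined above: build a tiny symbolic graph, compile it into a concrete graph plus its tensor and exec arenas, run it, and tear everything down. The graph-construction and run calls (ccv_nnc_symbolic_graph_new, ccv_nnc_tensor_symbol_new, ccv_nnc_graph_exec_symbol_new, ccv_nnc_graph_exec_symbol_autogen, ccv_nnc_tensor_from_symbol, ccv_nnc_graph_run, and the CPU_TENSOR_NHWC / TRAVERSE_FULL macros) come from the wider NNC API and are written from memory here, so treat their exact flags and argument lists as illustrative rather than authoritative.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

static void compile_and_run_example(void)
{
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  // Two inputs and one output, all 2-element CPU float tensors.
  const ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2), "a");
  const ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2), "b");
  const ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 2), "c");
  const ccv_nnc_tensor_symbol_t inputs[] = {a, b};
  const ccv_nnc_tensor_symbol_t outputs[] = {c};
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0), inputs, 2, outputs, 1, "sum");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  // Compile: produces the concrete graph, the tensor arena (allocations) and the exec arena (symbol -> exec mapping).
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, outputs, 1, ccv_nnc_symbolic_graph_sources(symbolic_graph), ccv_nnc_symbolic_graph_source_size(symbolic_graph), ccv_nnc_symbolic_graph_destinations(symbolic_graph), ccv_nnc_symbolic_graph_destination_size(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  // Fill the inputs through the arena-owned tensors, then run the whole graph.
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  a_tensor->data.f32[0] = 1, a_tensor->data.f32[1] = 2;
  b_tensor->data.f32[0] = 3, b_tensor->data.f32[1] = 4;
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // c now holds {4, 6}; read it back the same way.
  ccv_nnc_tensor_t* const c_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, c);
  (void)c_tensor;
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);
}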
3856
static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
3857
6.10k
{
3858
  // Buffers are inherited from above, no need to dealloc.
3859
6.10k
  int i;
3860
6.15k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
3861
50
    if (tensor_arena->sub_arenas[i])
3862
49
      _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
3863
6.16k
  for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
3864
61
  {
3865
61
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i));
3866
61
    assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv));
3867
61
    ccv_nnc_tensor_multiview_free(*mv);
3868
61
  }
3869
6.10k
  ccv_array_free(tensor_arena->tensor_metadata);
3870
6.10k
  ccv_array_free(tensor_arena->m_tensor_idx);
3871
6.10k
  if (tensor_arena->pb_vt_tensors)
3872
70
    ccfree(tensor_arena->pb_vt_tensors);
3873
6.10k
  if (tensor_arena->vt_alias_r_refs_p)
3874
70
    ccfree(tensor_arena->vt_alias_r_refs_p);
3875
6.10k
  if (tensor_arena->vt_sizes)
3876
9
    ccfree(tensor_arena->vt_sizes);
3877
6.10k
  ccfree(tensor_arena);
3878
6.10k
}
3879
3880
void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
3881
83.0k
{
3882
83.0k
  assert(tensor_arena->graph_ref == (intptr_t)symbol.graph);
3883
83.0k
  assert(symbol.d < tensor_arena->vt_tensor_size);
3884
83.0k
  assert(symbol.d >= 0);
3885
  // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
3886
83.0k
  int i;
3887
83.0k
  if (!tensor_arena->pb_vt_tensors)
3888
70
  {
3889
70
    tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
3890
7.62k
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3891
7.55k
      if (tensor_arena->vt_tensors[i])
3892
6.25k
        tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
3893
70
  }
3894
83.0k
  if (!tensor_arena->vt_alias_r_refs_p)
3895
70
  {
3896
70
    tensor_arena->vt_alias_r_refs_p = (int*)cccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
3897
70
    tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
3898
7.62k
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3899
7.55k
      if (tensor_arena->vt_alias_refs[i])
3900
571
      {
3901
571
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
3902
571
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
3903
571
        ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
3904
571
      }
3905
70
    int refp = 0;
3906
7.55k
    for (i = 1; i < tensor_arena->vt_tensor_size; i++) // Allocate each with aliases position on vt_alias_r_refs. It points to the end.
3907
7.48k
      if (tensor_arena->vt_alias_r_refs_p[i])
3908
565
        refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
3909
6.91k
      else
3910
6.91k
        tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
3911
7.05k
    for (i = refp; i < tensor_arena->vt_tensor_size; i++)
3912
6.98k
      tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
3913
7.62k
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3914
7.55k
      if (tensor_arena->vt_alias_refs[i])
3915
571
      {
3916
571
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
3917
571
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
3918
571
        const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
3919
571
        assert(pos >= 0);
3920
571
        tensor_arena->vt_alias_r_refs[pos] = i;
3921
571
      }
3922
70
  }
3923
83.0k
  const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
3924
83.0k
  if (CCV_IS_TENSOR_VIEW(tensor))
3925
0
  {
3926
0
    assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
3927
0
    assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->inc) == 0 &&
3928
0
          ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
3929
0
        ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->inc) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
3930
0
  } else
3931
83.0k
    { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
3932
83.0k
  if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
3933
0
    { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
3934
83.0k
  tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
3935
83.0k
  if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
3936
12.7k
    for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
3937
12.7k
    {
3938
12.7k
      const int d = tensor_arena->vt_alias_r_refs[i];
3939
12.7k
      if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
3940
11.7k
        break;
3941
1.00k
      ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
3942
1.00k
      if (CCV_IS_TENSOR_VIEW(d_tensor))
3943
2
        d_tensor->data.u8 = tensor->data.u8 + ((ccv_nnc_tensor_view_t*)d_tensor)->off;
3944
1.00k
      else
3945
1.00k
        d_tensor->data.u8 = tensor->data.u8;
3946
1.00k
    }
3947
83.0k
}
3948
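A sketch of the rebinding flow that ccv_nnc_tensor_bind_symbol() above enables: after compiling once, an externally owned buffer can be pointed at a symbol before a run, and ccv_nnc_tensor_arena_clear_bindings() restores the arena-owned data pointers afterwards. The symbol `a` stands for any tensor symbol of the compiled symbolic graph; ccv_nnc_tensor_new / ccv_nnc_tensor_free, ccv_nnc_graph_run and the CPU_TENSOR_NHWC / TRAVERSE_FULL macros are assumed from the wider NNC API.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// `a` is any tensor symbol already managed by this arena (e.g. an input of the compiled graph),
// with the same shape as used at compile time.
static void rebind_and_run(ccv_nnc_graph_t* const graph, ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t a)
{
  ccv_nnc_tensor_t* const my_input = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  my_input->data.f32[0] = 5, my_input->data.f32[1] = 6;
  // Point the arena's view of `a` at this externally owned buffer for the next run.
  ccv_nnc_tensor_bind_symbol(tensor_arena, a, my_input);
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  // Restore the arena-owned data pointers recorded in pb_vt_tensors.
  ccv_nnc_tensor_arena_clear_bindings(tensor_arena);
  ccv_nnc_tensor_free(my_input);
}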
3949
void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
3950
14.5k
{
3951
14.5k
  if (!tensor_arena->pb_vt_tensors)
3952
35
    return;
3953
14.4k
  int i;
3954
478k
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3955
464k
    if (tensor_arena->vt_tensors[i])
3956
291k
      tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
3957
14.4k
}
3958
3959
uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
3960
2
{
3961
2
  uint64_t total_size = 0;
3962
2
  int i;
3963
36
  for (i = 0; i < tensor_arena->buffer_size; i++)
3964
34
    total_size += tensor_arena->buffers[i].size;
3965
2
  return total_size;
3966
2
}
3967
3968
static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
3969
0
{
3970
0
  int i;
3971
0
  if (mv->it)
3972
0
    mv->it->info = params;
3973
0
  for (i = 0; i < mv->repeat + mv->kind; i++)
3974
0
  {
3975
0
    ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
3976
0
    if (CCV_IS_TENSOR_MULTIVIEW(tensor))
3977
0
      _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
3978
0
    else
3979
0
      tensor->info = params;
3980
0
  }
3981
0
}
3982
3983
int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
3984
2.20k
{
3985
2.20k
  int i;
3986
2.20k
  assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
3987
2.20k
  if (!tensor_arena->vt_sizes) // Keep the original sizes so we can check against them to see if we will overflow.
3988
9
  {
3989
9
    tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
3990
1.40k
    for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3991
1.39k
      if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
3992
838
      {
3993
838
        ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
3994
838
        if (CCV_IS_TENSOR_MULTIVIEW(tensor))
3995
0
        {
3996
0
          ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
3997
0
          while (CCV_IS_TENSOR_MULTIVIEW(mv))
3998
0
            mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
3999
0
          tensor = (ccv_nnc_tensor_t*)mv;
4000
0
        }
4001
838
        tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4002
838
      }
4003
9
  }
4004
2.20k
  int flag = 0;
4005
19.2k
  for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4006
17.0k
    if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4007
13.6k
    {
4008
13.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4009
13.6k
      flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(symbol_info->info));
4010
13.6k
    }
4011
2.20k
  if (flag)
4012
4
    return -1;
4013
19.2k
  for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4014
17.0k
    if (tensor_arena->vt_tensors[i])
4015
14.6k
    {
4016
14.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4017
14.6k
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4018
14.6k
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4019
0
      {
4020
0
        assert(!tensor_arena->vt_alias_refs[i]);
4021
0
        _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4022
14.6k
      } else if (!tensor_arena->vt_alias_refs[i])
4023
13.6k
        tensor->info = symbol_info->info;
4024
1.00k
      else {
4025
1.00k
        off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->inc, symbol_info->ofs);
4026
1.00k
        tensor->info = symbol_info->info;
4027
1.00k
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4028
1.00k
        tensor->data.u8 = tensor_arena->vt_tensors[alias_ref]->data.u8 + off;
4029
1.00k
        if (CCV_IS_TENSOR_VIEW(tensor))
4030
0
          ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4031
1.00k
      }
4032
14.6k
    }
4033
  // We should handle sub_tensor_arena, but don't do that at the moment.
4034
2.20k
  assert(!graph->sub_graphs);
4035
2.20k
  return 0;
4036
2.20k
}
4037
4038
void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4039
2.20k
{
4040
2.20k
  assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4041
2.20k
  int i;
4042
9.02k
  for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4043
6.82k
  {
4044
6.82k
    const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4045
6.82k
    if (graph_exec.d < 0)
4046
2.41k
      continue;
4047
4.41k
    const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4048
4.41k
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4049
4.41k
    ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4050
4.41k
    if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, carry over the backend and algorithm from the existing one, which has presumably been autotuned.
4051
4.41k
    {
4052
4.41k
      new_cmd.backend = existing_cmd.backend;
4053
4.41k
      new_cmd.algorithm = existing_cmd.algorithm;
4054
4.41k
    }
4055
4.41k
    ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4056
4.41k
  }
4057
2.20k
}
4058
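ccv_nnc_tensor_arena_reinit() and ccv_nnc_graph_exec_reinit() above are meant to be used together when tensor shapes on the symbolic graph change but stay within the originally allocated sizes (the vt_sizes check returns -1 otherwise). A hedged sketch of that flow; ccv_nnc_tensor_symbol_set() and the CPU_TENSOR_NHWC macro are assumed from the wider NNC API, and `a` is any tensor symbol of the compiled graph.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// Shrink a symbol's shape (e.g. a smaller batch) and refresh the already-compiled graph in place.
static int resize_and_refresh(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_nnc_graph_t* const graph, ccv_nnc_tensor_arena_t* const tensor_arena, ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_tensor_symbol_t a)
{
  // The new parameters must not exceed the sizes recorded at the first reinit (vt_sizes).
  ccv_nnc_tensor_symbol_set(symbolic_graph, a, CPU_TENSOR_NHWC(32F, 1));
  if (ccv_nnc_tensor_arena_reinit(tensor_arena, symbolic_graph) < 0)
    return -1; // Would overflow the original allocation; recompile instead.
  // Refresh the command parameters on the concrete graph, keeping autotuned backend / algorithm picks.
  ccv_nnc_graph_exec_reinit(graph_exec_arena, graph, symbolic_graph);
  return 0;
}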
4059
void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4060
6.26k
{
4061
6.26k
  int i;
4062
22.3k
  for (i = 0; i < tensor_arena->buffer_size; i++)
4063
16.1k
  {
4064
16.1k
    if (!tensor_arena->buffers[i].ptr)
4065
248
      continue;
4066
15.8k
    const int buffer_type = tensor_arena->buffers[i].type;
4067
15.8k
    const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4068
15.8k
#ifdef HAVE_CUDA
4069
15.8k
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4070
15.8k
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
4071
2.44k
    {
4072
2.44k
      if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4073
268
        tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4074
2.18k
      else
4075
2.18k
        cufree(device_id, tensor_arena->buffers[i].ptr);
4076
13.4k
    } else {
4077
13.4k
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4078
13.4k
      if (tensor_arena->buffers[i].pin_mem)
4079
11
        cuhostfree(tensor_arena->buffers[i].ptr);
4080
13.3k
      else
4081
13.3k
        ccfree(tensor_arena->buffers[i].ptr);
4082
13.4k
    }
4083
#else
4084
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4085
    ccfree(tensor_arena->buffers[i].ptr);
4086
#endif
4087
15.8k
    tensor_arena->buffers[i].ptr = 0;
4088
15.8k
  }
4089
6.26k
}
4090
4091
void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4092
6.05k
{
4093
6.05k
  ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4094
6.05k
  _ccv_nnc_tensor_arena_free(tensor_arena);
4095
6.05k
}
4096
4097
void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4098
6.10k
{
4099
6.10k
  int i;
4100
6.15k
  for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4101
50
    if (graph_exec_arena->sub_arenas[i])
4102
49
      ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4103
6.10k
  ccfree(graph_exec_arena);
4104
6.10k
}
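Since ccv_nnc_tensor_arena_buffer_free() zeroes each buffers[i].ptr and skips entries that are already 0, it can be called ahead of the final teardown to return the backing memory early; the later ccv_nnc_tensor_arena_free() call remains safe because its buffer pass becomes a no-op. A small sketch of that teardown order, with ccv_nnc_graph_free() assumed from the wider NNC API:

#include "ccv_nnc.h"

// Release the large backing buffers as soon as the last run finishes, then tear down
// the concrete graph and both arenas in the usual order.
static void release_and_teardown(ccv_nnc_graph_t* const graph, ccv_nnc_tensor_arena_t* const tensor_arena, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
{
  ccv_nnc_tensor_arena_buffer_free(tensor_arena); // Buffers are freed and their ptr zeroed here.
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena); // Its internal buffer pass now skips the zeroed ptrs.
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
}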