Coverage Report

Created: 2021-09-21 23:33

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_symbolic_graph_compile.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#ifdef HAVE_CUDA
6
#include "gpu/ccv_nnc_compat.h"
7
#endif
8
#include "_ccv_nnc_graph.h"
9
#include "_ccv_nnc_symbolic_graph.h"
10
11
// MARK - Level-3 API
12
13
typedef struct {
14
  int flags;
15
  int type;
16
  int pin_mem; // This memory needs to be pinned.
17
  int ref; // Reference to another tensor block. Start with 1.
18
  int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
19
  int companion_ref; // Reference to another block with which this one shares the same memory region. Start with 1. The current crude implementation requires the two to mutually be companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. For the allocation algorithm, we use the primary throughout.
20
  int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
21
  ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
22
  uint64_t size; // The size of the tensor expected.
23
  int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
24
  ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
25
  ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
26
  ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
27
} ccv_nnc_tensor_block_t; // Tensor Arena Block
28
29
4.85M
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
30
31
enum {
32
  UNASSIGNED = 0x1,
33
  ALIAS = 0x2,
34
  READ_ONLY = 0x4,
35
  WRITE_ONLY = 0x8,
36
  READ_WRITE = 0xc,
37
  ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not referencing any specific tensor).
38
  UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
39
  UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
40
};
41
42
#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
43
#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
44
5.89M
#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
45
6.18k
#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
46
3
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
47
10.2M
#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
48
9.60M
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
49
29.3k
#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
50
6.26k
#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
51
95
#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
52
#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
53
180
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
54
19.0k
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
55
116
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
56
12.7k
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
57
58
122k
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
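The macros above pack several orthogonal properties into one flags word: the low two bits (0x3) hold the assignment state (ordinary / UNASSIGNED / ALIAS), bits 2-3 (0xc) hold the read/write intent, and the higher bits are independent markers (ANONYMOUS, UNFOLDABLE_AS_INPUT, UNFOLDABLE_AS_OUTPUT). A minimal, self-contained sketch of that layout, using a stand-in struct rather than the real ccv_nnc_tensor_block_t:

#include <assert.h>

typedef struct { int flags; } block_t; /* stand-in for ccv_nnc_tensor_block_t */

int main(void)
{
  block_t b = { .flags = 0 };
  b.flags = (b.flags & ~0x3) | 0x1;  /* TENSOR_EXPECT_SET_UNASSIGNED */
  assert((b.flags & 0x3) == 0x1);    /* TENSOR_EXPECT_UNASSIGNED is now true */
  b.flags = (b.flags & ~0xc) | 0x4;  /* TENSOR_SET_READ_WRITE(b, READ_ONLY) */
  assert((b.flags & 0xc) == 0x4);    /* TENSOR_READ_WRITE(b) == READ_ONLY */
  b.flags |= 0x20;                   /* TENSOR_SET_UNFOLDABLE_AS_INPUT */
  assert((b.flags & 0x3) == 0x1);    /* the marker bits don't disturb the state bits */
  return 0;
}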
59
60
// Holds additional information about the exe nodes.
61
typedef struct {
62
  int flags;
63
} ccv_nnc_graph_exec_flag_t;
64
65
enum {
66
  CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
67
};
68
69
typedef struct {
70
  int index;
71
  int oc;
72
  int type;
73
  uint64_t size;
74
} ccv_nnc_tensor_opt_t;
75
76
// We first sort the same type together (because they won't be reused at all).
77
// Then we sort by size and, after that, by oc.
78
250k
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
79
250k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
80
#undef more_than
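more_than orders two candidates by size first and breaks ties by oc (the overlap count), both descending, so the qsort above hands back the largest and most-contended blocks first. A hedged sketch of the same ordering as a plain comparison function (opt_t here is an illustrative mirror of ccv_nnc_tensor_opt_t, not part of the library):

#include <stdint.h>

typedef struct { int index, oc, type; uint64_t size; } opt_t;

/* Nonzero when a should be placed before b: larger size first, then larger oc. */
static int opt_comes_first(const opt_t a, const opt_t b)
{
  return (a.size > b.size) || (a.size == b.size && a.oc >= b.oc);
}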
81
82
// If b has items that overlap with a, a is still considered after b (inclusive).
83
static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
84
0
{
85
0
  assert(a);
86
0
  assert(b);
87
0
  int x, y;
88
0
  for (x = 0; x < b->rnum; x++)
89
0
  {
90
0
    const int p = *(int*)ccv_array_get(b, x);
91
0
    int flag = 0;
92
0
    // In the extreme case where a is a superset of b, a is still after b, so we are good.
93
0
    for (y = 0; !flag && y < a->rnum; y++)
94
0
    {
95
0
      const int q = *(int*)ccv_array_get(a, y);
96
0
      flag = (p == q);
97
0
    }
98
0
    if (!flag)
99
0
      for (y = 0; y < a->rnum; y++)
100
0
      {
101
0
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
102
0
        if (!cell.i32 || cell.i32[0] == 0)
103
0
          return 0;
104
0
      }
105
0
  }
106
0
  // If b->rnum == 0, a is after b for sure.
107
0
  // Otherwise, if a->rnum == 0, we don't check anything, but if b->rnum > 0, then we cannot say a is after b.
108
0
  // If both a->rnum > 0 and b->rnum > 0, the logic above should have checked everything.
109
0
  return (a->rnum > 0 || b->rnum == 0);
110
0
}
111
112
static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
113
1.40M
{
114
1.40M
  assert(a);
115
1.40M
  assert(b);
116
1.40M
  int x, y, max_hop = 0;
117
1.48M
  for (x = 0; x < a->rnum; x++)
118
1.48M
    for (y = 0; y < b->rnum; y++)
119
1.40M
    {
120
1.40M
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
121
1.40M
      if (!cell.i32 || cell.i32[0] == 0)
122
1.32M
        return 0;
123
80.8k
      max_hop = ccv_max(cell.i32[0], max_hop);
124
80.8k
    }
125
1.40M
  // If we entered the nested loop above and did not bail out, a is verifiably, deterministically after b.
126
1.40M
  // The max hop denotes, in that case, at most how many hops it takes to get from a to b.
127
1.40M
  return max_hop;
128
1.40M
}
129
130
// Whether every head of a is deterministically after every tail of b.
131
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
132
1.40M
{
133
1.40M
  return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
134
1.40M
}
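Both helpers above rely on the same convention for exec_dep: cell (i, j) stores a positive hop count when exec node i is provably after exec node j, and an empty or zero cell means no such ordering is known. A small sketch of a single-pair query under that assumption, using only the sparse-matrix call already used above (the helper name is illustrative):

/* Sketch: hop count if exec node a is provably after exec node b, else 0. */
static int _exec_a_after_b(const ccv_sparse_matrix_t* const exec_dep, const int a, const int b)
{
  const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, a, b);
  return (cell.i32 && cell.i32[0] > 0) ? cell.i32[0] : 0;
}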
135
136
typedef struct {
137
  ccv_array_t** alloc_dep;
138
  int vt_block_size;
139
  int buffer_size;
140
  int block_size;
141
  int* vt_blocks; // A reference into blocks, because blocks only contains available blocks (thus, doesn't consider aliases etc.). -1 means no block pointed to. Starts at 0.
142
  struct {
143
    int type; // The type from tensor blocks.
144
    int pin_mem; // Whether this is pinned memory.
145
    int flags; // The flags (currently for READ_ONLY or not).
146
    uint64_t size; // The size of the buffer allocated.
147
    int p_refs[2]; // Reference to the upper-level block. Starts at 1. Only index 0 is valid throughout; index 1 is used in the code as a temporary placeholder.
148
    ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From a buffer, it can point to multiple blocks because it can be associated with multiple tensor blocks that point to different outputs (for example, in the 1st unroll it points to one block while in the 2nd unroll it points to another). Start with 0.
149
  }* buffers;
150
  struct {
151
    int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
152
    int block_ref; // A reference to which block in the given tensor_block to use.
153
    uint64_t offset; // The offset of this block.
154
  }* blocks;
155
} ccv_nnc_tensor_alloc_prep_t;
156
157
typedef struct ccv_nnc_symbolic_graph_prep_s {
158
  int flags;
159
  int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
160
  int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
161
  int exec_idx;
162
  int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
163
  int tensor_symbol_info_size;
164
  int exec_symbol_info_size;
165
  int tensor_block_size;
166
  int sub_prep_size;
167
  ccv_nnc_tensor_block_t* tensor_blocks;
168
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
169
  ccv_nnc_graph_exec_flag_t* exec_flags;
170
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
171
  int* dup_tensor_block_ref;
172
  ccv_nnc_graph_visit_t* visit;
173
  ccv_nnc_tensor_alloc_prep_t* alloc_prep;
174
  struct ccv_nnc_symbolic_graph_prep_s* p;
175
  struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
176
  // Structures that don't require to be freed after deallocation.
177
  const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
178
  ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
179
  ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
180
  ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs' life-cycle for the while expr.
181
} ccv_nnc_symbolic_graph_prep_t;
182
183
typedef struct {
184
  int oc;
185
  ccv_array_t* itf;
186
} ccv_nnc_tensor_block_adjacent_t;
187
188
static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
189
6.09k
{
190
6.09k
  // Compute how many dis-continuous buffers are needed.
191
6.09k
  // We prefer to have several dis-continuous buffers instead of one big buffer because
192
6.09k
  // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
193
6.09k
  // to fully utilize memory.
194
6.09k
  int i, j, k;
195
6.09k
  ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
196
6.09k
  int allocable_tensor_size = 0, available_tensor_size = 0;
197
100k
  for (i = 0; i < tensor_block_size; i++)
198
93.9k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
199
93.9k
    {
200
31.8k
      // Tensors that we need the header info.
201
31.8k
      ++available_tensor_size;
202
31.8k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
203
31.8k
        // Tensors that we actually need to allocate (exclude the alias).
204
31.8k
        ++allocable_tensor_size;
205
31.8k
    }
206
6.09k
  ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
207
6.09k
  ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
208
6.09k
  ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
209
6.09k
  // Overlap count.
210
100k
  for (i = 0; i < tensor_block_size; i++)
211
93.9k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
212
2.01M
      for (j = i + 1; j < tensor_block_size; j++)
213
1.99M
        if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
214
1.99M
        {
215
700k
          // Check to see if they interfere (default to yes).
216
700k
          // If any of the i's head is deterministically later than j's tail
217
700k
          // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
218
700k
          const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
219
700k
          if (i_hop_j > 0)
220
238
          {
221
238
            ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
222
238
            ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
223
238
          }
224
700k
          const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
225
700k
          if (j_hop_i > 0)
226
75.3k
          {
227
75.3k
            ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
228
75.3k
            ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
229
75.3k
          }
230
700k
          // It cannot be that both i can hop to j and j can hop to i.
231
700k
          assert(!(i_hop_j > 0 && j_hop_i > 0));
232
700k
          if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
233
150k
          {
234
150k
            if (!adj[i].itf)
235
4.49k
              adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
236
150k
            ccv_array_push(adj[i].itf, &j);
237
150k
            ++adj[i].oc;
238
150k
            if (!adj[j].itf)
239
24.2k
              adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
240
150k
            ccv_array_push(adj[j].itf, &i);
241
150k
            ++adj[j].oc;
242
150k
          }
243
700k
        }
244
6.09k
  int* const buf = (int*)ccmalloc(sizeof(int) * tensor_block_size);
245
6.09k
  int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
246
6.09k
  uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
247
6.09k
  uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
248
6.09k
  int num_assigned = 0; 
249
6.09k
  // I can do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
250
6.09k
  // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1) respectively).
251
6.09k
  // The first channel denotes the bytes available for allocation,
252
6.09k
  // the second channel denotes the offset available for the allocation,
253
6.09k
  ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
254
6.09k
  ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
255
35.0k
  for (j = 0; j < allocable_tensor_size;)
256
28.9k
  {
257
28.9k
    // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
258
28.9k
    uint64_t max_size = 0;
259
28.9k
    ccv_array_clear(opt);
260
28.9k
    int current_type = 0; // Deal with one type at a time.
261
4.48M
    for (i = 0; i < tensor_block_size; i++)
262
4.45M
      if (tensor_blocks[i].size >= max_size &&
263
4.45M
        TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
264
4.45M
        IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
265
4.45M
        (!current_type || tensor_blocks[i].type == current_type))
266
134k
      {
267
134k
        ccv_nnc_tensor_opt_t a = {
268
134k
          .size = tensor_blocks[i].size,
269
134k
          .index = i,
270
134k
          .oc = adj[i].oc,
271
134k
          .type = tensor_blocks[i].type,
272
134k
        };
273
134k
        assert(a.type);
274
134k
        current_type = a.type; // Now we know the primary type we should deal with.
275
134k
        if (tensor_blocks[i].companion_ref)
276
36
        {
277
36
          const int companion_ref = tensor_blocks[i].companion_ref - 1;
278
36
          a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
279
36
          a.oc += adj[companion_ref].oc;
280
36
        }
281
134k
        // In case we have a tie, take them all in the array.
282
134k
        if (a.size > max_size)
283
33.8k
          ccv_array_clear(opt), max_size = a.size;
284
134k
        ccv_array_push(opt, &a);
285
134k
      }
286
28.9k
    assert(opt->rnum > 0);
287
28.9k
    // Order opt array by the oc because type and size should be equal at this point.
288
28.9k
    _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
289
28.9k
    // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
290
28.9k
    int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep->rows * 3;
291
28.9k
    uint64_t min_val[2] = {
292
28.9k
      0, 0
293
28.9k
    };
294
108k
    for (i = 0; i < opt->rnum; i++)
295
92.3k
    {
296
92.3k
      ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
297
92.3k
      // Now, determine the order between a and c. After this, we can always check whether y
298
92.3k
      // can hop to the earliest one and if the latest one can hop to x.
299
92.3k
      // The earliest one will be called p and the latest one will be called q.
300
92.3k
      int p = a.index;
301
92.3k
      int q = a.index;
302
92.3k
      if (tensor_blocks[a.index].companion_ref)
303
33
      {
304
33
        const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
305
33
        const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
306
33
        if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
307
2
          p = companion_ref;
308
31
        else {
309
31
          const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
310
31
          if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
311
31
            q = companion_ref;
312
0
          else { // Otherwise, b is in between p and q.
313
0
            const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
314
0
            const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
315
0
            assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
316
0
          }
317
31
        }
318
33
      }
319
92.3k
      assert(tensor_blocks[q].type == tensor_blocks[p].type);
320
92.3k
      const int type = tensor_blocks[p].type;
321
92.3k
      // y is always earlier than x, but this is hard to assert now.
322
92.3k
      // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
323
92.3k
      // Thus, the hop between y and x (through a) should be smallest ones.
324
92.3k
      // We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
325
92.3k
      // out of q. For these nodes, we try to verify whether they form a connection (by checking against
326
92.3k
      // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
327
92.3k
      int y_size = 0;
328
92.3k
      int* const y_buf = buf;
329
279k
#define for_block(y, val) do { \
330
279k
        if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type) \
331
279k
          y_buf[y_size++] = y + 1; \
332
279k
      } while(0)
333
92.3k
      ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
334
92.3k
      if (y_vector)
335
279k
        CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
336
92.3k
#undef for_block
337
92.3k
      assert(y_size <= tensor_block_size);
338
92.3k
      int x_size = 0;
339
92.3k
      int* const x_buf = buf + y_size;
340
256k
#define for_block(x, val) do { \
341
256k
        if (((int*)val)[0] > 0 && assigned[x] && 
tensor_blocks[x].type == type88.0k
) \
342
256k
          
x_buf[x_size++] = x + 187.8k
; \
343
256k
      } while(0)
344
92.3k
      ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
345
92.3k
      if (x_vector)
346
256k
        CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
347
92.3k
#undef for_block
348
92.3k
      assert(y_size + x_size <= tensor_block_size);
349
92.3k
      int x, y;
350
213k
      for (y = 0; y < y_size; y++)
351
121k
      {
352
121k
        const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y], tensor_block_size + 1);
353
121k
        if (val.u64 && val.u64[0] >= a.size)
354
10.2k
        {
355
10.2k
          const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
356
10.2k
          assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
357
10.2k
          const int hop = exec_dep->rows + y_hop_p.i32[0];
358
10.2k
          if (hop < min_hop)
359
6.76k
            min_y = y_buf[y], min_x = tensor_block_size + 1, min_hop = hop,
360
6.76k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
361
10.2k
        }
362
121k
      }
363
180k
      for (x = 0; x < x_size; x++)
364
87.8k
      {
365
87.8k
        const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x]);
366
87.8k
        if (val.u64 && val.u64[0] >= a.size)
367
6.16k
        {
368
6.16k
          const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
369
6.16k
          assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
370
6.16k
          const int hop = exec_dep->rows + q_hop_x.i32[0];
371
6.16k
          if (hop < min_hop)
372
4.85k
            min_y = 0, min_x = x_buf[x], min_hop = hop,
373
4.85k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
374
6.16k
        }
375
87.8k
      }
376
213k
      for (y = 0; y < y_size; y++)
377
121k
      {
378
121k
        ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y]);
379
121k
        if (y_vector)
380
467k
          for (x = 0; x < x_size; x++)
381
346k
          {
382
346k
            const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x]);
383
346k
            if (val.u64 && val.u64[0] >= a.size)
384
3.14k
            {
385
3.14k
              const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
386
3.14k
              const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
387
3.14k
              assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
388
3.14k
              assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
389
3.14k
              const int hop = y_hop_p.i32[0] + q_hop_x.i32[0];
390
3.14k
              if (hop < min_hop)
391
2.59k
                min_y = y_buf[y], min_x = x_buf[x], min_hop = hop,
392
2.59k
                  min_val[0] = val.u64[0], min_val[1] = val.u64[1];
393
3.14k
            }
394
346k
          }
395
121k
      }
396
92.3k
      // If I found a place, stop, and exit.
397
92.3k
      if (min_y > 0 || min_x < tensor_block_size + 1)
398
12.9k
      {
399
12.9k
        min_i = i;
400
12.9k
        break;
401
12.9k
      }
402
92.3k
    }
403
28.9k
    // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
404
28.9k
    // and default to largest size available.
405
28.9k
    ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
406
28.9k
    if (min_i == -1)
407
15.9k
    {
408
15.9k
      allocated_size[num_assigned] = a.size;
409
15.9k
      ++num_assigned;
410
15.9k
    }
411
28.9k
    int assign_group = num_assigned;
412
28.9k
    if (min_y > 0)
413
8.50k
    {
414
8.50k
      assign_group = assigned[min_y - 1];
415
8.50k
      // The y and x should belong to the same assigned group.
416
8.50k
      assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
417
20.4k
    } else if (min_x < tensor_block_size + 1)
418
4.49k
      assign_group = assigned[min_x - 1];
419
28.9k
    // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
420
28.9k
    if (min_y != 0 || min_x != tensor_block_size + 1)
421
12.9k
    {
422
12.9k
      uint64_t val[2] = {
423
12.9k
        min_val[0], min_val[1]
424
12.9k
      };
425
12.9k
      assert(val[0] >= a.size);
426
12.9k
      val[0] -= a.size;
427
12.9k
      val[1] = val[1] + a.size; // Move the offset to the next one.
428
12.9k
      ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
429
12.9k
    }
430
28.9k
    int strings[3];
431
28.9k
    strings[0] = a.index + 1;
432
28.9k
    int string_size = 1;
433
28.9k
    // Assign out the designated companion if it exists.
434
28.9k
    if (tensor_blocks[a.index].companion_ref)
435
20
    {
436
20
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
437
20
      assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
438
20
      const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
439
20
      if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
440
2
      {
441
4
        for (i = 0; i < string_size; i++)
442
2
          strings[i + 1] = strings[i];
443
2
        strings[0] = companion_ref + 1;
444
18
      } else {
445
18
        const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
446
18
        if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
447
18
          strings[string_size] = companion_ref + 1;
448
0
        else {
449
0
          // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must in between p and q. Therefore, I must have 2 allocations.
450
0
          assert(string_size == 2);
451
0
          strings[2] = strings[1];
452
0
          strings[1] = companion_ref + 1;
453
0
        }
454
18
      }
455
20
      ++string_size;
456
20
    }
457
28.9k
    // Assign out and update oc.
458
58.0k
    for (i = 0; i < string_size; i++)
459
29.0k
    {
460
29.0k
      const int index = strings[i] - 1;
461
29.0k
      // Assign out the selected one.
462
29.0k
      assigned[index] = assign_group;
463
29.0k
      // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
464
29.0k
      allocated_offset[index] = min_val[1];
465
29.0k
      if (adj[index].itf)
466
329k
        for (k = 0; k < adj[index].itf->rnum; k++)
467
300k
        {
468
300k
          const int d = *(int*)ccv_array_get(adj[index].itf, k);
469
300k
          if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
470
300k
            --adj[d].oc;
471
300k
        }
472
29.0k
    }
473
28.9k
    uint64_t val[2] = {
474
28.9k
      a.size, min_val[1]
475
28.9k
    };
476
28.9k
    uint64_t consumed_size = 0;
477
28.9k
    // Go over from min_y to string_size (excluding min_x).
478
28.9k
    for (i = 0; i < string_size; i++)
479
28.9k
    {
480
28.9k
      const uint64_t size = tensor_blocks[strings[i] - 1].size;
481
28.9k
      assert(size <= a.size);
482
28.9k
      // Update consumed size if it is bigger than "size".
483
28.9k
      if (size > consumed_size)
484
28.9k
      {
485
28.9k
        val[0] = size - consumed_size;
486
28.9k
        ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
487
28.9k
        consumed_size = size;
488
28.9k
        val[1] = min_val[1] + consumed_size;
489
28.9k
      }
490
28.9k
      // If it consumed all the flow, break out.
491
28.9k
      if (consumed_size == a.size)
492
28.9k
        break;
493
28.9k
    }
494
58.0k
    for (i = 0; i < string_size; i++)
495
29.0k
    {
496
29.0k
      const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
497
29.0k
      uint64_t val[2] = {
498
29.0k
        i_size, min_val[1]
499
29.0k
      };
500
29.0k
      uint64_t consumed_size = 0;
501
29.0k
      for (k = i + 1; k < string_size; k++)
502
20
      {
503
20
        const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
504
20
        // Update consumed size if it is bigger than "size".
505
20
        if (size > consumed_size)
506
20
        {
507
20
          val[0] = size - consumed_size;
508
20
          ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
509
20
          consumed_size = size;
510
20
          val[1] = min_val[1] + consumed_size;
511
20
        }
512
20
        // If it consumed all the flow, break out.
513
20
        if (consumed_size == i_size)
514
20
          break;
515
20
      }
516
29.0k
      val[0] = i_size - consumed_size;
517
29.0k
      // Still have residual, flow it to min_x.
518
29.0k
      if (val[0] > 0)
519
28.9k
        ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
520
29.0k
    }
521
28.9k
    j += string_size;
522
28.9k
  }
523
6.09k
  ccfree(buf);
524
6.09k
  ccv_array_free(opt);
525
6.09k
  ccv_matrix_free(tensor_df);
526
6.09k
  ccv_matrix_free(tensor_dt);
527
58.0k
#define for_block(y, x, val) do { \
528
58.0k
    if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
529
58.0k
    { \
530
13.1k
      if (!alloc_dep[x - 1]) \
531
13.1k
        alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
532
13.1k
      ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
533
13.1k
    } \
534
58.0k
  } while (0)
535
58.0k
  CCV_SPARSE_FOREACH(alloc, for_block);
536
6.09k
#undef for_block
537
6.09k
  ccv_matrix_free(alloc);
538
100k
  for (i = 0; i < tensor_block_size; i++)
539
93.9k
    if (adj[i].itf)
540
28.7k
      ccv_array_free(adj[i].itf);
541
6.09k
  ccfree(adj);
542
6.09k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
543
6.09k
  alloc_prep->alloc_dep = alloc_dep;
544
6.09k
  alloc_prep->vt_block_size = tensor_block_size;
545
6.09k
  alloc_prep->buffer_size = num_assigned;
546
6.09k
  alloc_prep->block_size = available_tensor_size;
547
6.09k
  alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
548
6.09k
  alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
549
6.09k
  alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
550
6.09k
  memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
551
22.0k
  for (i = 0; i < num_assigned; i++)
552
15.9k
    alloc_prep->buffers[i].size = allocated_size[i];
553
6.09k
  ccfree(allocated_size);
554
6.09k
  j = 0;
555
6.09k
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
556
100k
  for (i = 0; i < tensor_block_size; i++)
557
93.9k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
558
93.9k
    {
559
31.8k
      alloc_prep->blocks[j].block_ref = i;
560
31.8k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
561
31.8k
      {
562
29.0k
        alloc_prep->vt_blocks[i] = j;
563
29.0k
        // Also, set its allocations.
564
29.0k
        assert(assigned[i] > 0);
565
29.0k
        const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
566
29.0k
        alloc_prep->blocks[j].offset = allocated_offset[i];
567
29.0k
        if (!alloc_prep->buffers[buffer_ref].type)
568
15.9k
          alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
569
29.0k
        alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
570
29.0k
        alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
571
29.0k
        assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
572
29.0k
      } else {
573
2.80k
        alloc_prep->vt_blocks[i] = -1;
574
2.80k
        alloc_prep->blocks[j].buffer_ref = -1;
575
2.80k
        alloc_prep->blocks[j].offset = 0;
576
2.80k
      }
577
31.8k
      ++j;
578
31.8k
    } else
579
62.1k
      alloc_prep->vt_blocks[i] = -1;
580
6.09k
  ccfree(allocated_offset);
581
6.09k
  ccfree(assigned);
582
6.09k
  return alloc_prep;
583
6.09k
}
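The function above decides buffer sharing by routing block sizes through the alloc flow matrix between a virtual source (row 0) and sink (row tensor_block_size + 1). As a much-simplified illustration of the underlying idea only (a greedy stacking toy, not the flow-based algorithm used here): blocks whose lifetimes never overlap may sit at the same offset of one buffer, while interfering blocks must be stacked disjointly. All names below are hypothetical.

#include <stdint.h>

/* Toy model: a block is live from exec step head to tail (inclusive). */
typedef struct { int head, tail; uint64_t size, offset; } toy_block_t;

static int toy_interfere(const toy_block_t a, const toy_block_t b)
{
  return !(a.tail < b.head || b.tail < a.head); /* lifetimes overlap */
}

/* Greedily assign offsets in a single buffer; returns the buffer size needed. */
static uint64_t toy_assign(toy_block_t* const blocks, const int count)
{
  uint64_t watermark = 0;
  int i, j;
  for (i = 0; i < count; i++)
  {
    uint64_t offset = 0;
    for (j = 0; j < i; j++)
      if (toy_interfere(blocks[i], blocks[j]) && blocks[j].offset + blocks[j].size > offset)
        offset = blocks[j].offset + blocks[j].size; /* must sit above every interfering block */
    blocks[i].offset = offset;
    if (offset + blocks[i].size > watermark)
      watermark = offset + blocks[i].size;
  }
  return watermark;
}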
584
585
static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
586
6.09k
{
587
6.09k
  int i;
588
100k
  for (i = 0; i < alloc_prep->vt_block_size; i++)
589
93.9k
    if (alloc_prep->alloc_dep[i])
590
12.8k
      ccv_array_free(alloc_prep->alloc_dep[i]);
591
22.0k
  for (i = 0; i < alloc_prep->buffer_size; i++)
592
15.9k
    if (alloc_prep->buffers[i].dup_p_refs)
593
13
      ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
594
6.09k
  ccfree(alloc_prep->alloc_dep);
595
6.09k
  ccfree(alloc_prep);
596
6.09k
}
597
598
// Simple allocator from ccv_array_t.
599
static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
600
78.3k
{
601
78.3k
  int pos = tensor_metadata->rnum;
602
78.3k
  int rsize = (size + 15) / 16;
603
78.3k
  ccv_array_resize(tensor_metadata, pos + rsize);
604
78.3k
  return (pos << 1) + 1;
605
78.3k
}
606
607
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
608
166k
{
609
166k
  assert((pos >> 1) < tensor_metadata->rnum);
610
166k
  return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
611
166k
}
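The metadata allocator hands out encoded positions rather than raw pointers, so references survive as the ccv_array_t grows and its backing storage moves: a position is the 16-byte slot index shifted left by one with the low bit set, which tells it apart from a real (at least 2-byte aligned) pointer. A small sketch of the encode/decode pair under that same 16-byte slot assumption (helper names are illustrative):

#include <stdint.h>

static inline int pos_encode(const int slot)  { return (slot << 1) + 1; }            /* mirrors (pos << 1) + 1 */
static inline int pos_decode(const int pos)   { return pos >> 1; }                   /* mirrors pos >> 1 */
static inline int is_pos(const void* const p) { return (int)((uintptr_t)p & 1); }    /* mirrors CCV_NNC_IS_METADATA_POS */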
612
613
85.0k
#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
614
615
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
616
84.5k
{
617
84.5k
  // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
618
84.5k
  if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
619
84.5k
    return vt_tensor;
620
84.5k
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
621
84.5k
  if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref))
622
84.5k
  {
623
83
    const int alias_ref = tensor->alias_ref;
624
83
    tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
625
83
    _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
626
83
  }
627
84.5k
  if (CCV_IS_TENSOR_MULTIVIEW(tensor))
628
84.5k
  {
629
75
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
630
75
    int i;
631
75
    const int count = mv->kind + mv->repeat;
632
240
    for (i = 0; i < count; i++)
633
165
    {
634
165
      if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
635
165
      {
636
147
        const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
637
147
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
638
147
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
639
147
      }
640
165
    }
641
75
    // No need to recursively do parent pointer, otherwise we are in deep rewire.
642
75
    if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p))
643
75
      mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
644
75
    if (mv->sp)
645
65
      for (i = 0; i < mv->sp->rnum; i++)
646
37
      {
647
37
        ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
648
37
        if (CCV_NNC_IS_METADATA_POS(*tensor))
649
37
        {
650
30
          const int pos = (int)(intptr_t)*tensor;
651
30
          *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
652
30
          assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
653
30
          _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
654
30
        }
655
37
      }
656
75
  }
657
84.5k
  return tensor;
658
84.5k
}
659
660
typedef struct {
661
  const uint8_t* ptr;
662
  int pos;
663
} ccv_nnc_tensor_block_pos_t;
664
665
static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
666
114
{
667
114
  int i;
668
114
  int unref_block_ref = block_ref;
669
117
  while (prep->tensor_blocks[unref_block_ref].ref)
670
3
    unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
671
114
  int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
672
114
  assert(vt_ref >= 0);
673
114
  assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
674
114
  const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
675
114
  uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
676
114
  int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
677
114
  for (i = idx - 1; i >= 0; i--)
678
114
  {
679
114
    assert(p_ref >= 0);
680
114
    const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
681
114
    const int unroll_count = graph_prep->unroll_count;
682
114
    if (ch[i]) // Prefer the dup side of things.
683
12
      p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
684
114
    int unref_p_ref = p_ref;
685
114
    while (graph_prep->tensor_blocks[unref_p_ref].ref)
686
0
      unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
687
114
    vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
688
114
    const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
689
114
    offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
690
114
    // If the buffer already exists, prefer that.
691
114
    const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
692
114
    if (ptr)
693
114
    {
694
114
      // If I have any remaining path that is not covered from 0, I cannot possibly
695
114
      // have any pointer from buffer (that can only happen if it is not dup).
696
138
      for (--i; i >= 0; i--)
697
24
        if (ch[i] != 0)
698
0
          return 0;
699
114
      // Try to find the created tensor block pos in the array, just linear scan.
700
114
      const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
701
114
      ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
702
114
      *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr + offset, params, 0);
703
114
      return tv_pos;
704
0
    }
705
0
    p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
706
0
  }
707
114
  return 0;
708
114
}
709
710
// Descend from the root to the prep level, and compose the multiview from there.
711
static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
712
114
{
713
114
  assert(pos_ref);
714
114
  int i;
715
114
  const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
716
114
  const int unroll_count = prep->unroll_count;
717
114
  if (prep == graph_prep)
718
57
  {
719
57
    const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
720
57
    if (!data_pos)
721
0
      return -1;
722
57
    // Based on ch, go all the way back to find the exact pointer to compose.
723
57
    if (// !assign_update && // If I plan to receive an assign update, we don't need multiple receivers. Just one tensor to receive the update is enough.
724
57
      prep->dup_tensor_block_ref &&
725
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
726
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
727
41
    {
728
41
      int pos[unroll_count + 1];
729
41
      pos[0] = data_pos;
730
98
      for (i = 0; i < unroll_count; i++)
731
57
        pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
732
41
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
733
41
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
734
41
      ccv_nnc_tensor_t* data[unroll_count + 1];
735
139
      for (i = 0; i < unroll_count + 1; i++)
736
98
        data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
737
41
      ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
738
139
      for (i = 0; i < unroll_count + 1; i++)
739
98
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
740
41
      *pos_ref = mv_pos;
741
41
    } else {
742
16
      *pos_ref = data_pos;
743
16
    }
744
57
    if (preserve)
745
5
    {
746
5
      // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv.
747
5
      // At any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
748
5
      // mv of K11; thus, when the loop is 0, it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
749
5
      // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
750
5
      // arena is allocated).
751
5
      // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed the raw pointer directly or
752
5
      // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
753
5
      // it into a K01 structure.
754
5
      // Why didn't we wrap it directly as mv->data[0] pointing to an assigned tv pointer and mv->data[1] pointing
755
5
      // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one
756
5
      // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
757
5
      int prev_mv_pos = *pos_ref;
758
5
      if (prev_mv_pos == -1)
759
0
      {
760
0
        prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
761
0
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
762
0
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
763
0
        ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
764
0
          tv,
765
0
        }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
766
0
        CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
767
0
      }
768
5
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
769
5
      ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
770
5
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
771
5
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
772
5
        CCV_NNC_TENSOR_PLACEHOLDER,
773
5
        (ccv_nnc_tensor_t*)prev_mv,
774
5
      }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
775
5
      prev_mv->p = (void*)(intptr_t)mv_pos;
776
5
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
777
5
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
778
5
      *pos_ref = mv_pos;
779
5
    }
780
57
    return 0;
781
57
  }
782
57
  ch[idx] = 0;
783
57
  int pos[unroll_count + 1];
784
57
  pos[0] = 0;
785
57
  const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
786
57
  assert(retval == 0);
787
67
  for (i = 0; i < unroll_count; i++)
788
10
  {
789
10
    ch[idx] = i + 1;
790
10
    pos[i + 1] = 0;
791
10
    const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
792
10
    if (dup_retval < 0)
793
0
    {
794
0
      assert(i == 0);
795
0
      break;
796
0
    }
797
10
  }
798
57
  // If current prep has no dup.
799
57
  if (i == 0)
800
47
  {
801
47
    *pos_ref = pos[0];
802
47
    return 0;
803
47
  }
804
10
  ccv_nnc_tensor_t* data[unroll_count + 1];
805
10
  // Compose to a new multiview.
806
30
  for (i = 0; i < unroll_count + 1; i++)
807
20
    { assert(pos[i] > 0); }
808
10
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
809
30
  for (i = 0; i < unroll_count + 1; 
i++20
)
810
20
    data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
811
10
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
812
10
  ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
813
30
  for (i = 0; i < unroll_count + 1; i++)
814
20
    if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
815
20
      ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
816
30
  for (i = 0; i < unroll_count + 1; i++)
817
20
    CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
818
10
  *pos_ref = mv_pos;
819
10
  return 0;
820
10
}
821
822
static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
823
312
{
824
312
  int i;
825
312
  int is_input = 0;
826
312
  assert(node);
827
766
  for (i = 0; i < node->input_size && !is_input; i++)
828
454
    if (p_ref == node->inputs[i])
829
153
      is_input = 1;
830
312
  int is_output = 0;
831
725
  for (i = 0; i < node->output_size && !is_output; i++)
832
413
    if (p_ref == node->outputs[i])
833
167
      is_output = 1;
834
312
  // Prefer treating it as an output if it is both an input and an output.
835
312
  if (is_output)
836
167
    return 1;
837
145
  if (is_input)
838
145
    return -1;
839
0
  return 0;
840
0
}
841
842
static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
843
61
{
844
61
  // No need to check whether to preserve if this is not a while loop.
845
61
  if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
846
8
    return 0;
847
53
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
848
53
  // If it is unassigned, no need to preserve.
849
53
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
850
53
    return 0;
851
51
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
852
51
  // If p is not input, no need to preserve at all.
853
51
  if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
854
19
    return 0;
855
32
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
856
32
  assert(vt_ref >= 0);
857
32
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
858
32
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
859
32
  // If the buffer is a truly read-only one, no need to preserve.
860
32
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
861
6
    return 0;
862
26
  /* This needs detailed explanation, what does preserve mean?
863
26
   * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
864
26
   * also used outside of the while loop, we cannot reuse the memory region of x for
865
26
   * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
866
26
   * y uses the same memory region as x). The way to workaround this is by using a different
867
26
   * memory region for y = x + 1, but for the first iteration, having x pointing to the
868
26
   * original. During the allocation process, the way to identify whether x should preserve
869
26
   * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
870
26
   * parent tensor is the same as the memory region it plans to use in the buffer, then we are
871
26
   * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
872
26
   * it is the input tensor whenever that is possible. A tensor block can point to two parent
873
26
   * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
874
26
   * tensor whenever that is possible. */
875
26
  if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
876
15
    return 0;
877
11
  // Otherwise, return 1 because we now need to preserve.
878
11
  return 1;
879
11
}
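The comment above explains why a loop-carried input such as x in while { y = x + 1 } (y => x) cannot blindly share memory with y when x is also read outside the loop. A toy illustration in plain C (not the nnc API) of the failure the preserve logic avoids:

float x = 3.f;   /* value that is also needed outside the loop */
float* y = &x;   /* pretend y was folded onto x's memory region */
*y = x + 1.f;    /* first iteration: the original 3.f is overwritten with 4.f */
/* Any consumer of the original x now sees 4.f, so y must get its own region,
   with only the first iteration reading from the original x. */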
880
881
static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
882
58
{
883
58
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
884
58
  // If it is unassigned, no need to preserve.
885
58
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
886
58
    return 0;
887
58
  // Only tape var need to force broadcast, otherwise we already share the same memory region.
888
58
  if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
889
54
    return 0;
890
4
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
891
4
  // If p is not output, no need to broadcast at all.
892
4
  if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
893
3
    return 0;
894
1
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
895
1
  assert(vt_ref >= 0);
896
1
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
897
1
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
898
1
  // If the buffer is a truly read-only one, no need to broadcast.
899
1
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
900
0
    return 0;
901
1
  // Otherwise, return 1 because we now need to force broadcast for this tape var.
902
1
  return 1;
903
1
}
904
905
static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
906
25
{
907
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
908
25
  int i;
909
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
910
53
    if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
911
53
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
912
45
    else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
913
45
      _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
914
25
}
915
916
static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
917
25
{
918
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
919
25
  int i;
920
25
  if (mv->sp)
921
8
    for (i = 0; i < mv->sp->rnum; i++)
922
6
    {
923
6
      ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
924
6
      if (CCV_NNC_IS_METADATA_POS(*tensor))
925
6
      {
926
1
        const int pos = (int)(intptr_t)*tensor;
927
1
        *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
928
1
        assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
929
1
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
930
1
      }
931
6
    }
932
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
933
53
  {
934
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
935
53
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
936
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
937
53
      CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
938
53
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
939
53
      _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
940
53
  }
941
25
}
942
943
static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
944
47
{
945
47
  // Go to the root of the graph.
946
47
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
947
47
  int i;
948
104
  for (i = 1; prep->p; i++)
949
57
    prep = prep->p;
950
47
  // Root graph should have no dup tensor blocks.
951
47
  assert(!prep->dup_tensor_block_ref);
952
47
  const int c = i;
953
47
  const ccv_nnc_symbolic_graph_prep_t* preps[c];
954
47
  prep = graph_prep;
955
47
  preps[c - 1] = prep;
956
104
  for (i = 0; prep->p; i++)
957
57
    preps[c - 2 - i] = prep = prep->p;
958
47
  int ch[c]; // Use dynamic allocation for array. This is an array to record our selections when recursive from top to bottom.
959
47
  memset(ch, 0, sizeof(int) * c);
960
47
  int pos = 0;
961
47
  _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
962
47
  assert(ch[c - 1] == 0); // This should never be modified.
963
47
  assert(pos > 0);
964
47
  return pos;
965
47
}
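_ccv_nnc_tensor_multiview_gen first climbs prep->p to count the depth, then records the chain again so preps[0] is the root and preps[c - 1] is the current prep. A minimal sketch of that root-first chain construction over a hypothetical parent-linked type (node_t and chain_from_root are illustrative, not library names):

typedef struct node_s { struct node_s* p; } node_t; /* stand-in for ccv_nnc_symbolic_graph_prep_t */

/* Fills chain[0..c-1] with the path from the root down to leaf; returns c, or -1 if it doesn't fit. */
static int chain_from_root(const node_t* leaf, const node_t** const chain, const int max)
{
  int c = 1;
  const node_t* n;
  for (n = leaf; n->p; n = n->p)
    ++c;
  if (c > max)
    return -1;
  int i;
  for (i = c - 1, n = leaf; i >= 0; i--, n = n ? n->p : 0)
    chain[i] = n;
  return c;
}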
966
967
static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
968
3
{
969
3
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
970
3
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
971
3
  ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
972
3
  ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
973
3
    CCV_NNC_TENSOR_PLACEHOLDER,
974
3
    tv,
975
3
  }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
976
3
  CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
977
3
  CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
978
3
  return mv_pos;
979
3
}
980
981
static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
982
30
{
983
30
  ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
984
30
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr);
985
30
  if (!is_multiview)
986
18
    return pos;
987
24
  while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr))
988
12
  {
989
12
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
990
12
    tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
991
12
  }
992
12
  const ccv_nnc_tensor_t tensor = *tensor_ptr;
993
12
  const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
994
12
  ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
995
12
  *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
996
12
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
997
12
  new_tensor->alias_ref = (uintptr_t)pos;
998
12
  ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
999
12
  return new_pos;
1000
12
}
1001
1002
static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1003
6.09k
{
1004
6.09k
  // All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1005
6.09k
  // Each tensor have the designation in assigned array, and offset in allocated_offset.
1006
6.09k
  const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1007
6.09k
  ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1008
6.09k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1009
6.09k
  const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1010
6.09k
  const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1011
6.09k
  const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1012
6.09k
  const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1013
6.09k
  const int unroll_count = graph_prep->unroll_count;
1014
6.09k
  int i, j;
1015
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1016
93.8k
    for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && j < unroll_count; j++)
1017
7
    {
1018
7
      const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1019
7
      if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref]))
1020
7
        TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i]);
1021
7
    }
1022
6.09k
  ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1023
6.09k
  graph_prep->tensor_arena = tensor_arena;
1024
6.09k
  tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1025
6.09k
  tensor_arena->buffers = (void*)(tensor_arena + 1);
1026
6.09k
  tensor_arena->buffer_size = alloc_prep->buffer_size;
1027
6.09k
  tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1028
6.09k
  tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1029
6.09k
  tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1030
6.09k
  tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1031
6.09k
  tensor_arena->pb_vt_tensors = 0;
1032
6.09k
  tensor_arena->vt_alias_r_refs_p = 0;
1033
6.09k
  tensor_arena->vt_alias_r_refs = 0;
1034
6.09k
  tensor_arena->vt_sizes = 0;
1035
6.09k
  tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1036
6.09k
  tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1037
6.09k
  tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1038
6.09k
  tensor_arena->allocator.context.free = allocator.context.free;
1039
6.09k
  tensor_arena->allocator.isa = allocator.isa;
1040
6.09k
  // Copy alias_ref info back to the tensor arena.
1041
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1042
93.7k
    tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1043
6.09k
  // Do the buffer copies.
1044
22.0k
  for (i = 0; i < alloc_prep->buffer_size; i++)
1045
15.9k
    tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1046
15.9k
      tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1047
15.9k
      tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1048
6.09k
  if (graph_prep->while_count_tensor)
1049
19
  {
1050
19
    // If we need to have a while count tensor, allocate that first and set its pointer to point to the while_count variable.
1051
19
    int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1052
19
    assert((0 << 1) + 1 == pos); // pos must be 0 position.
1053
19
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1054
19
    *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1055
19
  }
1056
6.09k
  assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
1057
6.09k
  if (p_arena && p_graph_prep)
1058
49
  {
1059
49
    // Don't need to allocate the actual buffer, just use the pointer from the above.
1060
49
    PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
1061
229
    for (i = 0; i < tensor_arena->buffer_size; i++)
1062
180
    {
1063
180
      const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1064
180
      int unref_p_ref = p_ref;
1065
182
      while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1066
2
        unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1067
180
      assert(unref_p_ref >= 0);
1068
180
      const int p_unroll_count = p_graph_prep->unroll_count;
1069
180
      if (p_graph_prep->dup_tensor_block_ref &&
1070
180
        p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1071
180
        p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1072
10
      {
1073
10
        // This condition means in the parent graph, we point to multiple tensor blocks for the same
1074
10
        // buffer, therefore, we cannot have one single pointer assigned in this case.
1075
10
        // Later we will handle this by generate ccv_tensor_multiview_t structure.
1076
10
        tensor_arena->buffers[i].ptr = 0;
1077
10
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1078
10
        continue;
1079
10
      }
1080
170
      // Otherwise, find the actual buffer pointer.
1081
170
      const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1082
170
      assert(vt_ref >= 0);
1083
170
      const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1084
170
      if (!p_arena->buffers[buffer_ref].ptr)
1085
0
      {
1086
0
        // Pass it down as 0 ptr.
1087
0
        tensor_arena->buffers[i].ptr = 0;
1088
0
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1089
0
        continue;
1090
0
      }
1091
170
      const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1092
170
      tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1093
170
      PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
1094
170
    }
1095
6.04k
  } else {
1096
6.04k
    // Now, allocate actual buffers.
1097
6.04k
    PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
1098
21.8k
    for (i = 0; i < tensor_arena->buffer_size; i++)
1099
15.8k
    {
1100
15.8k
      const int buffer_type = tensor_arena->buffers[i].type;
1101
15.8k
      const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
1102
15.8k
#ifdef HAVE_CUDA
1103
15.8k
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1104
2.41k
      {
1105
2.41k
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1106
2.41k
        if (allocator.isa && allocator.isa->alloc)
1107
260
          tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1108
2.15k
        else
1109
2.15k
          tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1110
2.41k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1111
13.3k
      } else {
1112
13.3k
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1113
13.3k
        if (tensor_arena->buffers[i].pin_mem)
1114
11
          tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1115
13.3k
        else
1116
13.3k
          ccmemalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
1117
13.3k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1118
13.3k
      }
1119
#else
1120
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1121
      ccmemalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
1122
      PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1123
#endif
1124
15.8k
      assert(tensor_arena->buffers[i].ptr);
1125
15.8k
    }
1126
6.04k
  }
1127
6.09k
  // Go over sub_preps and allocate arenas for them. Do it this early because
1128
6.09k
  // we may reference tensors from sub arenas, the reason why we need to reference
1129
6.09k
  // tensors from sub arenas is because for output tensors, sub arena's tensor
1130
6.09k
  // will have automatic reference updates.
1131
6.14k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1132
50
    if (graph_prep->sub_preps[i])
1133
49
      tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1134
1
    else
1135
1
      tensor_arena->sub_arenas[i] = 0;
1136
6.09k
  memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1137
6.09k
  // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1138
6.09k
  ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1139
6.14k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1140
50
    if (tensor_arena->sub_arenas[i])
1141
49
    {
1142
49
      assert(graph_prep->sub_preps[i]);
1143
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1144
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1145
49
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1146
45
        for (j = 0; j < node->output_size; j++)
1147
24
        {
1148
24
          const int idx = node->outputs[j];
1149
24
          const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1150
24
          assert(s_idx >= 0);
1151
24
          ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1152
24
          assert(sub_arena_out_tensors[idx] == 0);
1153
24
          ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1154
24
          // Only assign if it is a multiview tensor.
1155
24
          if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) ||
1156
24
            (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)))
1157
17
            sub_arena_out_tensors[idx] = sub_tensor;
1158
24
        }
1159
49
    }
1160
6.09k
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
1161
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1162
93.7k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1163
93.7k
    {
1164
28.8k
      const int vt_ref = alloc_prep->vt_blocks[i];
1165
28.8k
      const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1166
28.8k
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1167
28.8k
      // previous layer, therefore, cannot really find the buffer ptr.
1168
28.8k
      if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1169
28.8k
        ((graph_prep->dup_tensor_block_ref &&
1170
28.8k
          graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1171
28.8k
          graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1172
28.8k
         (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1173
47
      {
1174
47
        assert(graph_prep->p); // This must be in a sub-graph.
1175
47
        // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1176
47
        if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1177
4
          continue;
1178
43
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1179
43
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1180
43
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1181
28.8k
      } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])) {
1182
28.8k
        // When we want to allocate, we don't really need to if it needs a force broadcast, because we will handle that later.
1183
28.8k
        const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1184
28.8k
        // If already created, use the same tensor, and continue.
1185
28.8k
        // Having ptr.
1186
28.8k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1187
28.8k
        ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1188
28.8k
        // Also, set its allocations.
1189
28.8k
        // Since tensor view is bit compatible with tensor, we can just cast.
1190
28.8k
        *tensor = ccv_nnc_tensor(tensor_arena->buffers[buffer_ref].ptr + offset, tensor_symbol_info[i].info, 0);
1191
28.8k
        assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size);
1192
28.8k
        // If we need to force broadcast, we need to wrap it in a multiview.
1193
28.8k
        if (graph_prep->tensor_blocks[i].p_refs[0] &&
1194
28.8k
          _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1195
1
        {
1196
1
          const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1197
1
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1198
1
          ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1199
1
          ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1200
1
            tv,
1201
1
          }, 0, 1, graph_prep->graph, mv);
1202
1
          CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1203
1
          pos = mv_pos;
1204
1
          ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1205
1
        }
1206
28.8k
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1207
28.8k
      }
1208
28.8k
    }
1209
6.09k
  // Handle bound tensors. First handle cases without aliases.
1210
53.5k
  for (i = 0; i < tensor_bind_size; i++)
1211
47.4k
  {
1212
47.4k
    assert(tensor_binds[i].tensor);
1213
47.4k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1214
47.4k
    if (resolved_symbol.d >= 0)
1215
47.4k
    {
1216
47.4k
      int d = resolved_symbol.d;
1217
47.4k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1218
47.4k
        continue;
1219
46.4k
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1220
46.4k
      // It has nothing to do with aliases.
1221
46.6k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1222
146
        d = tensor_blocks[d].ref - 1;
1223
46.4k
      // For bound tensors, it shouldn't be assigned yet.
1224
46.4k
      // If it is assigned, the pointer should match the one from the bound tensor.
1225
46.4k
      // This can only happen if an enforced in-place tensor is bound twice. If that
1226
46.4k
      // happens, we need to make sure it is bound to the same location.
1227
46.4k
      assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1228
46.4k
      // See above assertion.
1229
46.4k
      if (tensor_arena->vt_tensors[d])
1230
0
        continue;
1231
46.4k
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1232
46.4k
      {
1233
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1234
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1235
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1236
0
        if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1237
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1238
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1239
0
        if (ccv_nnc_dimension_count(otv->inc) > 0)
1240
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1241
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->inc[j]); }
1242
0
        else // If it doesn't have inc, it is OK for it to be smaller than or equal to the bound one as a whole.
1243
0
          { assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)); }
1244
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1245
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1246
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1247
46.4k
      } else {
1248
46.4k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1249
46.4k
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1250
46.4k
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.ptr, tensor_symbol_info[d].info, 0);
1251
46.4k
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1252
46.4k
      }
1253
46.4k
    }
1254
47.4k
  }
1255
6.09k
  // Handle bound tensors. We handle aliases here so they can reference bound tensors.
1256
53.5k
  for (i = 0; i < tensor_bind_size; i++)
1257
47.4k
  {
1258
47.4k
    assert(tensor_binds[i].tensor);
1259
47.4k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1260
47.4k
    if (resolved_symbol.d >= 0)
1261
47.4k
    {
1262
47.4k
      int d = resolved_symbol.d;
1263
47.4k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
1264
47.4k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1265
47.4k
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1266
47.4k
      // It has nothing to do with aliases.
1267
47.6k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1268
146
        d = tensor_blocks[d].ref - 1;
1269
47.4k
      if (tensor_arena->vt_tensors[d])
1270
47.4k
        continue;
1271
1
      // Assert original alias has no ofs. Otherwise our binding will be problematic.
1272
13
      for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1273
12
        { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0); }
1274
1
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1275
1
      {
1276
0
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1277
0
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1278
0
        ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1279
0
        if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1280
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1281
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]); }
1282
0
        if (ccv_nnc_dimension_count(otv->inc) > 0)
1283
0
          for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
1284
0
            { assert(tensor_symbol_info[d].info.dim[j] <= otv->inc[j]); }
1285
0
        else // If it doesn't have inc, it is OK for it to be smaller than or equal to the bound one as a whole.
1286
0
          { assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)); }
1287
0
        memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1288
0
        memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1289
0
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1290
1
      } else {
1291
1
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1292
1
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1293
1
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.ptr, tensor_symbol_info[d].info, 0);
1294
1
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1295
1
      }
1296
1
    }
1297
47.4k
  }
1298
6.09k
  // Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1299
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1300
93.7k
    // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1301
93.7k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i])
1302
6.16k
    {
1303
6.16k
      int ref = tensor_blocks[i].ref - 1;
1304
6.16k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && tensor_blocks[ref].ref)
1305
1
        ref = tensor_blocks[ref].ref - 1;
1306
6.16k
      assert(tensor_arena->vt_tensors[ref]);
1307
6.16k
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1308
6.16k
    }
1309
6.09k
  // Now that refs are assigned out, handle the case where I need to preserve because I am a sub-graph of a while loop.
1310
6.09k
  if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1311
21
  {
1312
21
    assert(graph_prep->p);
1313
21
    const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1314
21
    const int p_idx = graph_prep->p_idx - 1;
1315
46
    for (i = 0; i < node->input_size; i++)
1316
25
    {
1317
25
      const int idx = node->inputs[i];
1318
25
      int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1319
25
      assert(!tensor_blocks[block_ref].ref);
1320
25
      const int vt_ref = alloc_prep->vt_blocks[block_ref];
1321
25
      if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1322
18
        continue;
1323
7
      assert(vt_ref >= 0);
1324
7
      const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1325
7
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1326
7
      assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]));
1327
7
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1328
7
      // previous layer, therefore, cannot really find the buffer ptr.
1329
7
      if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1330
7
        ((graph_prep->dup_tensor_block_ref &&
1331
7
          graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1332
7
          graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1333
7
         !tensor_arena->buffers[buffer_ref].ptr))
1334
4
      {
1335
4
        // We haven't allocated anything for this yet.
1336
4
        assert(tensor_arena->vt_tensors[block_ref] == 0);
1337
4
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1338
4
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1339
4
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1340
4
      } else {
1341
3
        const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1342
3
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1343
3
        ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1344
3
      }
1345
7
    }
1346
21
  }
1347
6.09k
  // For the case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1348
6.09k
  // This creates the multi-view tensor to achieve that.
1349
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1350
93.7k
    if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1351
10
    {
1352
10
      const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1353
10
      // Create phi multi-view.
1354
10
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1355
10
      const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1356
10
      const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1357
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1358
10
      ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1359
10
      ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1360
10
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1361
10
        intv,
1362
10
        outv,
1363
10
      }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI, mv);
1364
10
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1365
10
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1366
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1367
10
      ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1368
10
    }
1369
6.09k
  // Now it is time to handle alias.
1370
37.9k
  for (i = 0; i < alloc_prep->block_size; i++)
1371
31.8k
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1372
31.6k
    {
1373
31.6k
      const int block_ref = alloc_prep->blocks[i].block_ref;
1374
31.6k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1375
31.6k
      {
1376
2.78k
        // Assigning out the tensor aliases.
1377
2.78k
        assert(tensor_symbol_info[block_ref].alias_ref);
1378
2.78k
        const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1379
2.78k
        // What it references is not an alias.
1380
2.78k
        assert(tensor_arena->vt_tensors[alias_ref]);
1381
2.78k
        const int alias_pos = (int)(intptr_t)tensor_arena->vt_tensors[alias_ref];
1382
2.78k
        const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1383
2.78k
        assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1384
2.78k
        // Will use that to determine whether insert reference or not.
1385
2.78k
        const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1386
2.79k
        while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1387
2.78k
        {
1388
13
          const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1389
13
          alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1390
13
        }
1391
2.78k
        const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1392
2.78k
        // If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1393
2.78k
        int pos;
1394
2.78k
        if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1395
2.78k
          memcmp(tensor_symbol_info[block_ref].inc, tensor_symbol_info[block_ref].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
1396
2.71k
        {
1397
2.71k
          pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1398
2.71k
          ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1399
2.71k
          *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1400
2.71k
        } else {
1401
71
          pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1402
71
          ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1403
71
          // Otherwise initialize a tensor view
1404
71
          *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].inc);
1405
71
          tensor_view->alias_ref = (uintptr_t)alias_pos;
1406
71
        }
1407
2.78k
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1408
2.78k
        if (is_multiview)
1409
13
        {
1410
13
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1411
13
          ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1412
13
        }
1413
2.78k
      }
1414
31.6k
    }
1415
6.09k
  // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1416
6.14k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1417
50
    if (tensor_arena->sub_arenas[i])
1418
49
    {
1419
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1420
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1421
138
      for (j = 0; j < node->input_size; j++)
1422
89
      {
1423
89
        const int idx = node->inputs[j];
1424
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1425
89
        if (s_idx < 0)
1426
23
          continue;
1427
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1428
66
        // Only do the replacement if it is a multi-view tensor.
1429
66
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1430
66
        if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1431
66
        {
1432
18
          // It cannot be binded tensor.
1433
18
          assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]));
1434
18
          const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1435
18
          const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1436
18
          ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1437
18
          // If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1438
18
          // to this tensor.
1439
18
          if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor))
1440
18
          {
1441
6
            const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1442
6
            ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1443
6
            ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1444
6
            ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1445
6
            ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1446
6
            ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)[0]);
1447
6
            while (CCV_IS_TENSOR_MULTIVIEW(tv))
1448
6
              tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]);
1449
6
            *ref_tensor = ccv_nnc_tensor(tv->data.ptr, tv->info, 0);
1450
6
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1451
6
          } else
1452
12
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1453
18
        }
1454
66
      }
1455
49
    }
1456
6.09k
  // After aliases are created, for the case..of statement we now revert back to a flat tensor rather than the multi-view.
1457
6.09k
  // No worries though: this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1458
6.09k
  // when initializing the case..of node that will take the phi multi-view again.
1459
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1460
93.7k
    if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1461
10
    {
1462
10
      assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]));
1463
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1464
10
      assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI);
1465
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1466
10
    }
1467
6.09k
  // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1468
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1469
93.7k
    if (tensor_arena->vt_tensors[i])
1470
84.3k
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1471
6.09k
  // Associate multiview tensors from sub arena to the parent.
1472
6.09k
  if (sub_arena_out_tensors)
1473
29
  {
1474
242
    for (i = 0; i < alloc_prep->block_size; i++)
1475
213
      if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1476
113
      {
1477
113
        const int block_ref = alloc_prep->blocks[i].block_ref;
1478
113
        if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1479
113
          continue;
1480
113
        int sub_arena_ref = block_ref;
1481
113
        if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1482
113
        {
1483
10
          // Assigning out the tensor aliases.
1484
10
          assert(tensor_symbol_info[block_ref].alias_ref);
1485
10
          const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1486
10
          // What it references is not an alias.
1487
10
          assert(tensor_arena->vt_tensors[alias_ref]);
1488
10
          sub_arena_ref = alias_ref;
1489
10
          if (!sub_arena_out_tensors[sub_arena_ref])
1490
3
            continue;
1491
110
        }
1492
110
        if (!sub_arena_out_tensors[sub_arena_ref])
1493
86
          continue;
1494
24
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref]) ? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1495
24
        assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1496
24
        // This is only possible if the vt_tensors is a phi node.
1497
24
        if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1498
0
        {
1499
0
          // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1500
0
          ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1501
0
          assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI);
1502
0
          assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]));
1503
0
          CCV_NNC_MULTIVIEW_DATA(phi)[1]->alias_ref = (uintptr_t)mv;
1504
0
          ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)[1]);
1505
24
        } else {
1506
24
          tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1507
24
          ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1508
24
        }
1509
24
      }
1510
29
  }
1511
6.09k
  // Go over all the tensors that have assign_ref. If the tensor it is assigned from is:
1512
6.09k
  // 1). From sub_arena_out_tensors: it could now be pointing to an area this arena doesn't know about.
1513
6.09k
  // 2). From a phi multi-view: in this case this arena won't know beforehand which memory it is going to use.
1514
6.09k
  // Therefore, for the above two scenarios, the tensor with assign_ref, even if it is a multiview tensor, needs to subscribe
1515
6.09k
  // to the output of the assign_ref tensor.
1516
99.8k
  for (i = 0; i < tensor_symbol_info_size; i++)
1517
93.7k
    if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1518
25
    {
1519
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1520
25
      ccv_nnc_tensor_t* assign_tensor;
1521
25
      if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1522
0
        assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref]) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1523
25
      else
1524
25
        assign_tensor = tensor_arena->vt_tensors[assign_ref];
1525
25
      ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1526
25
    }
1527
6.09k
  // After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1528
53.5k
  for (i = 0; i < tensor_bind_size; i++)
1529
47.4k
  {
1530
47.4k
    assert(tensor_binds[i].tensor);
1531
47.4k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1532
47.4k
    if (resolved_symbol.d >= 0)
1533
47.4k
    {
1534
47.4k
      int d = resolved_symbol.d;
1535
47.4k
      // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1536
47.4k
      // It has nothing to do with aliases.
1537
47.6k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1538
146
        d = tensor_blocks[d].ref - 1;
1539
47.4k
      // Note we don't trace back on alias. This is intentional.
1540
47.4k
      assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1541
47.4k
    }
1542
47.4k
  }
1543
6.09k
  if (sub_arena_out_tensors)
1544
6.09k
    ccfree(sub_arena_out_tensors);
1545
6.09k
  // Rewire sub arena's tensor references.
1546
6.14k
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1547
50
    if (tensor_arena->sub_arenas[i])
1548
49
    {
1549
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1550
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1551
138
      for (j = 0; j < node->input_size; j++)
1552
89
      {
1553
89
        const int idx = node->inputs[j];
1554
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1 : -1;
1555
89
        if (s_idx < 0)
1556
23
          continue;
1557
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1558
66
        // Only do the replacement if it is a multi-view tensor.
1559
66
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1560
66
        if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor))
1561
66
        {
1562
18
          // This is a bound tensor; bind it now.
1563
18
          if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1564
18
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1565
18
          else
1566
18
            _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1567
18
        }
1568
66
      }
1569
49
    }
1570
6.09k
  return tensor_arena;
1571
6.09k
}
1572
1573
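Buffer allocation in the arena above dispatches on memory type: GPU buffers go through the custom allocator's alloc callback when one is provided and otherwise cumalloc; CPU buffers use cuhostalloc when they are marked pin_mem (so GPU transfers can DMA from them) and otherwise ccmemalign with 16-byte alignment. A standalone sketch of the same decision tree, with stubbed-out allocators standing in for the real ones (hypothetical, for illustration only):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for cumalloc / cuhostalloc / allocator.isa->alloc. */
static void* gpu_alloc_stub(const size_t size) { return malloc(size); }
static void* pinned_alloc_stub(const size_t size) { return malloc(size); }

enum { MEM_CPU, MEM_GPU };

typedef struct {
  int memory_type; /* MEM_CPU or MEM_GPU, like CCV_TENSOR_GET_MEMORY(buffer_type) */
  int pin_mem;     /* request page-locked host memory */
  size_t size;
  void* (*custom_alloc)(size_t size, void* context); /* optional, like allocator.isa->alloc */
  void* context;
} buffer_request_t;

static void* allocate_buffer(const buffer_request_t* const req)
{
  if (req->memory_type == MEM_GPU)
    return req->custom_alloc ? req->custom_alloc(req->size, req->context) : gpu_alloc_stub(req->size);
  if (req->pin_mem) /* CPU memory that participates in GPU transfers */
    return pinned_alloc_stub(req->size);
  return malloc(req->size); /* the real code uses ccmemalign(&ptr, 16, size) here */
}

int main(void)
{
  const buffer_request_t req = { MEM_CPU, 0, 1024, 0, 0 };
  void* const ptr = allocate_buffer(&req);
  printf("allocated %p\n", ptr);
  free(ptr);
  return 0;
}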
static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1574
17
{
1575
17
  assert(graph);
1576
17
  if ((intptr_t)graph == tensor_arena->graph_ref)
1577
7
  {
1578
7
    assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size);
1579
7
    return tensor_arena->vt_tensors[pair_ref];
1580
10
  }
1581
10
  int i;
1582
13
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1583
10
    if (tensor_arena->sub_arenas[i])
1584
10
    {
1585
10
      ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1586
10
      if (tensor)
1587
7
        return tensor;
1588
10
    }
1589
10
  return 0;
1590
10
}
1591
1592
static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1593
7
{
1594
7
  if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
1595
7
    tensor->type |= CCV_TAPE_ALLOC;
1596
2
  else {
1597
2
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1598
2
    mv->type |= CCV_TAPE_ALLOC;
1599
2
    int i;
1600
5
    for (i = 0; i < mv->repeat + mv->kind; i++)
1601
3
      _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1602
2
  }
1603
7
}
1604
1605
static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1606
6.09k
{
1607
6.09k
  assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
1608
6.09k
  int i;
1609
99.8k
  for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1610
93.7k
  {
1611
93.7k
    if (graph_prep->tensor_symbol_info[i].pair_ref)
1612
7
    {
1613
7
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1614
7
      // No need to continue check this if it is from its pair.
1615
7
      continue;
1616
7
    }
1617
93.7k
    if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1618
7
    {
1619
7
      // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1620
7
      if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i]))
1621
7
      {
1622
5
        const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1623
5
        if (vt_ref >= 0 &&
1624
5
          TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref]) == READ_ONLY)
1625
3
          continue;
1626
4
      }
1627
4
      _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1628
4
    }
1629
93.7k
  }
1630
6.14k
  for (i = 0; i < graph_prep->sub_prep_size; i++)
1631
50
    if (graph_prep->sub_preps[i])
1632
49
      _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1633
6.09k
}
1634
1635
static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1636
132k
{
1637
132k
  int i, found = 0;
1638
132k
  // Try to insert head.
1639
132k
  ccv_array_t* head = tensor_blocks.head;
1640
132k
  assert(head);
1641
134k
  for (i = 0; i < head->rnum;)
1642
62.2k
  {
1643
62.2k
    const int head_idx = *(int*)ccv_array_get(head, i);
1644
62.2k
    if (head_idx == idx)
1645
114
    {
1646
114
      found = 1;
1647
114
      break;
1648
114
    }
1649
62.1k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1650
62.1k
    if (cell.i32 && cell.i32[0] > 0)
1651
41
    {
1652
41
      /* If the current node is the parent of the head node, check if we found it or not. */
1653
41
      /* If not found, replace the current one. */
1654
41
      if (!found)
1655
41
      {
1656
41
        found = 1;
1657
41
        *(int*)ccv_array_get(head, i) = idx;
1658
41
      } else {
1659
0
        /* Remove the current one, change the rnum. */
1660
0
        if (i < head->rnum - 1)
1661
0
          *(int*)ccv_array_get(head, i) = *(int*)ccv_array_get(head, head->rnum - 1);
1662
0
        --head->rnum;
1663
0
        continue;
1664
0
      }
1665
62.1k
    } else {
1666
62.1k
      // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1667
62.1k
      cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1668
62.1k
      if (cell.i32 && cell.i32[0] > 0)
1669
59.8k
      {
1670
59.8k
        found = 1;
1671
59.8k
        break;
1672
59.8k
      }
1673
2.27k
    }
1674
2.27k
    /* Advancing i. */
1675
2.27k
    ++i;
1676
2.27k
  }
1677
132k
  /* If not found, push this idx to the end of the array. */
1678
132k
  if (!found)
1679
72.5k
    ccv_array_push(head, &idx);
1680
132k
  // Try to insert tail.
1681
132k
  found = 0;
1682
132k
  ccv_array_t* tail = tensor_blocks.tail;
1683
132k
  assert(tail);
1684
191k
  for (i = 0; i < tail->rnum;)
1685
63.4k
  {
1686
63.4k
    const int tail_idx = *(int*)ccv_array_get(tail, i);
1687
63.4k
    if (tail_idx == idx)
1688
4.46k
    {
1689
4.46k
      found = 1;
1690
4.46k
      break;
1691
4.46k
    }
1692
58.9k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1693
58.9k
    if (cell.i32 && cell.i32[0] > 0)
1694
56.4k
    {
1695
56.4k
      /* If the current node is the child of the tail node, check if we found it or not. */
1696
56.4k
      /* If not found, replace the current one. */
1697
56.4k
      if (!found)
1698
55.3k
      {
1699
55.3k
        found = 1;
1700
55.3k
        *(int*)ccv_array_get(tail, i) = idx;
1701
55.3k
      } else {
1702
1.08k
        /* Remove the current one, change the rnum. */
1703
1.08k
        *(int*)ccv_array_get(tail, i) = *(int*)ccv_array_get(tail, tail->rnum - 1);
1704
1.08k
        --tail->rnum;
1705
1.08k
        continue;
1706
1.08k
      }
1707
2.52k
    } else {
1708
2.52k
      // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1709
2.52k
      cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1710
2.52k
      if (cell.i32 && cell.i32[0] > 0)
1711
110
      {
1712
110
        found = 1;
1713
110
        break;
1714
110
      }
1715
57.7k
    }
1716
57.7k
    /* Advancing i. */
1717
57.7k
    ++i;
1718
57.7k
  }
1719
132k
  /* If not found, push this idx to the end of the array. */
1720
132k
  if (!found)
1721
72.6k
    ccv_array_push(tail, &idx);
1722
132k
}
1723
1724
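The function above maintains, per tensor block, a minimal set of "head" (first-use) and "tail" (last-use) execution nodes: a new node replaces an existing head when it precedes it in exec_dep, is dropped when an existing head already precedes it, and is appended otherwise (tails are handled symmetrically). A compact model of the head-update rule, with a hypothetical precedes() oracle standing in for the sparse dependency matrix:

#include <stdio.h>

#define MAX_HEADS 8

/* Hypothetical reachability oracle standing in for the exec_dep sparse matrix:
 * precedes(a, b) returns 1 when node b transitively depends on node a. */
static int precedes(const int earlier, const int later)
{
  /* toy DAG: 0 -> 1 -> 2 and 0 -> 3 */
  static const int reach[4][4] = {
    { 0, 0, 0, 0 },
    { 1, 0, 0, 0 },
    { 1, 1, 0, 0 },
    { 1, 0, 0, 0 },
  };
  return reach[later][earlier];
}

/* Insert idx into the head set, keeping only the earliest nodes (same policy as above,
 * minus the duplicate-removal pass). */
static void head_add(int* const heads, int* const count, const int idx)
{
  int i, found = 0;
  for (i = 0; i < *count; i++)
  {
    if (heads[i] == idx) { found = 1; break; }
    if (precedes(idx, heads[i])) { heads[i] = idx; found = 1; break; } /* idx is earlier: replace */
    if (precedes(heads[i], idx)) { found = 1; break; } /* an earlier head already covers idx: drop */
  }
  if (!found && *count < MAX_HEADS)
    heads[(*count)++] = idx;
}

int main(void)
{
  int heads[MAX_HEADS], count = 0, i;
  head_add(heads, &count, 2); /* first use observed at node 2 */
  head_add(heads, &count, 1); /* node 1 precedes node 2, so it replaces it */
  head_add(heads, &count, 3); /* unordered with respect to node 1, kept as a second head */
  printf("heads:");
  for (i = 0; i < count; i++)
    printf(" %d", heads[i]);
  printf("\n"); /* prints: heads: 1 3 */
  return 0;
}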
ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1725
6.63k
{
1726
6.63k
  if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1727
6.53k
  {
1728
6.53k
    assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size);
1729
6.53k
    ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1730
6.53k
    if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor))
1731
6.53k
    {
1732
11
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1733
22
      while (CCV_IS_TENSOR_MULTIVIEW(mv))
1734
11
        mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1735
11
      return (ccv_nnc_tensor_t*)mv;
1736
11
    }
1737
6.52k
    return tensor;
1738
6.52k
  }
1739
100
  int i;
1740
123
  for (i = 0; i < tensor_arena->sub_arena_size; i++)
1741
99
    if (tensor_arena->sub_arenas[i])
1742
99
    {
1743
99
      ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1744
99
      if (tensor)
1745
76
        return tensor;
1746
99
    }
1747
100
  return 0;
1748
100
}
1749
1750
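ccv_nnc_tensor_from_symbol above (like _ccv_nnc_tensor_arena_find_pair_ref earlier and ccv_nnc_graph_exec_from_symbol below) follows the same pattern: answer from this arena when the symbol's graph matches graph_ref, otherwise recurse into the sub-arenas and return 0 (or an empty exec) when nothing resolves. A minimal standalone model of that recursive lookup, with hypothetical types in place of the real arena structures:

#include <stdio.h>
#include <stdint.h>

/* Simplified model (hypothetical types): each arena knows which graph it belongs to and
 * owns an array of sub-arenas; lookups match the graph first and only then recurse. */
typedef struct lookup_arena_s {
  intptr_t graph_ref;
  int value;                  /* stands in for vt_tensors[symbol.d] */
  int sub_size;
  struct lookup_arena_s** subs;
} lookup_arena_t;

static const int* arena_find(const lookup_arena_t* const arena, const intptr_t graph_ref)
{
  if (arena->graph_ref == graph_ref)
    return &arena->value;
  int i;
  for (i = 0; i < arena->sub_size; i++)
    if (arena->subs[i])
    {
      const int* const found = arena_find(arena->subs[i], graph_ref);
      if (found)
        return found;
    }
  return 0; /* not resolvable from this arena or any of its sub-arenas */
}

int main(void)
{
  lookup_arena_t inner = { 2, 42, 0, 0 };
  lookup_arena_t* subs[1] = { &inner };
  lookup_arena_t outer = { 1, 7, 1, subs };
  printf("%d\n", *arena_find(&outer, 2)); /* resolved from the sub-arena: prints 42 */
  return 0;
}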
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1751
66.5k
{
1752
66.5k
  if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1753
66.5k
  {
1754
66.5k
    assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size);
1755
66.5k
    return graph_exec_arena->graph_execs[symbol.d];
1756
7
  }
1757
7
  int i;
1758
9
  for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1759
7
    if (graph_exec_arena->sub_arenas[i])
1760
7
    {
1761
7
      ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1762
7
      if (!CCV_NO_GRAPH_EXEC(exec))
1763
7
        return exec;
1764
7
    }
1765
7
  return (ccv_nnc_graph_exec_t){}; // 0.
1766
7
}
1767
1768
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1769
9
{
1770
9
  return graph_exec_arena->source;
1771
9
}
1772
1773
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1774
9
{
1775
9
  return graph_exec_arena->destination;
1776
9
}
1777
1778
// Check whether the head is the beginning of this block.
1779
static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1780
50
{
1781
50
  assert(tensor_block->head);
1782
50
  return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0) == head_node);
1783
50
}
1784
1785
// Check whether the tail is the end of this block.
1786
static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
1787
39
{
1788
39
  assert(tensor_block->tail);
1789
39
  return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0) == tail_node);
1790
39
}
1791
1792
// Make two tensor blocks one. Return 1 if that happened.
1793
static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
1794
6.41k
{
1795
6.41k
  // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
1796
6.41k
  if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]) &&
1797
6.41k
    (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1]) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
1798
6.41k
    tensor_blocks[p_ref_0].tail->rnum == 1 &&
1799
6.41k
    tensor_blocks[p_ref_1].head->rnum == 1 &&
1800
6.41k
    tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
1801
6.41k
    *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0))
1802
6.41k
  {
1803
6.17k
    // If the two parent refs match (thus, they meet at the same node), we can concatenate them and mark one as a ref. This is very similar to in-place operation combining.
1804
6.17k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]));
1805
6.17k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]));
1806
6.17k
    ccv_array_free(tensor_blocks[p_ref_0].tail);
1807
6.17k
    tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
1808
6.17k
    if (tensor_blocks[p_ref_1].p_refs[0])
1809
14
    {
1810
14
      assert(tensor_blocks[p_ref_1].p_refs[1] == 0); // It simply cannot have more than one p_refs, otherwise we cannot merge.
1811
14
      if (!tensor_blocks[p_ref_0].p_refs[0])
1812
10
        tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
1813
4
      else
1814
4
        tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
1815
14
    }
1816
6.17k
    tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
1817
6.17k
    TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]));
1818
6.17k
    ccv_array_free(tensor_blocks[p_ref_1].head);
1819
6.17k
    if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1]))
1820
6.17k
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]);
1821
6.17k
    // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
1822
6.17k
    TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1]);
1823
6.17k
    tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
1824
6.17k
    if (!tensor_blocks[p_ref_0].r_refs)
1825
6.10k
      tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
1826
6.17k
    ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
1827
6.17k
    tensor_blocks[p_ref_1].size = 0;
1828
6.17k
    tensor_blocks[p_ref_1].head = 0;
1829
6.17k
    tensor_blocks[p_ref_1].tail = 0;
1830
6.17k
    return 1;
1831
245
  }
1832
245
  return 0;
1833
245
}
1834
1835
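Folding above merges the input block of an in-place operation into its output block when neither side forbids it, both have a single tail/head node, the types match, and the input's last use is exactly the output's first use; the surviving block then absorbs the other's lifetime while the folded block becomes an unassigned reference. A simplified, self-contained model of that decision (hypothetical types, ignoring the p_refs, pin_mem and read/write bookkeeping the real fold also performs):

#include <assert.h>
#include <stdio.h>

/* Hypothetical, simplified tensor block: one head/tail node instead of arrays. */
typedef struct {
  int type;
  int unfoldable_as_input;
  int unfoldable_as_output;
  int head;       /* first node that uses the block */
  int tail;       /* last node that uses the block */
  int unassigned; /* set when the block has been folded away */
  int ref;        /* 1-based reference to the block it was folded into */
} block_t;

/* Mirror of the fold conditions above, on the simplified representation. */
static int try_fold(block_t* const blocks, const int in, const int out)
{
  if (blocks[in].unfoldable_as_input || blocks[out].unfoldable_as_output)
    return 0;
  if (blocks[in].type != blocks[out].type) /* must live in the same memory / device type */
    return 0;
  if (blocks[in].tail != blocks[out].head) /* must meet at exactly the same node */
    return 0;
  blocks[in].tail = blocks[out].tail; /* the surviving block absorbs the other's lifetime */
  blocks[out].unassigned = 1;
  blocks[out].ref = in + 1; /* 1-based, like tensor_blocks[p_ref_1].ref above */
  return 1;
}

int main(void)
{
  /* Block 0 is last read at node 5; block 1 is produced at node 5 (an in-place op). */
  block_t blocks[2] = {
    { .type = 0, .head = 2, .tail = 5 },
    { .type = 0, .head = 5, .tail = 9 },
  };
  assert(try_fold(blocks, 0, 1) == 1);
  printf("block 0 now lives over [%d, %d]; block 1 refs block %d\n", blocks[0].head, blocks[0].tail, blocks[1].ref);
  return 0;
}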
static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
1836
6.10k
{
1837
6.10k
  int i, j, k;
1838
6.10k
  // Generate exec dependencies (or, in other words, partial ordering of executions).
1839
6.10k
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
1840
6.10k
  int* buf = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
1841
6.10k
  int buf_size;
1842
6.10k
  if (p_node_info)
1843
62
    { assert(output_size == 0); }
1844
6.10k
#define for_block(x, val) \
1845
210k
  do { \
1846
210k
    if (((int32_t*)val)[0] > 0) \
1847
210k
    { \
1848
210k
      buf[buf_size * 2] = x; \
1849
210k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
1850
210k
      ++buf_size; \
1851
210k
    } \
1852
210k
  } while (0)
1853
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
1854
31.8k
    buf_size = 0; /* save all its parent deps to this buffer */
1855
31.8k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
1856
31.8k
    if (vector)
1857
210k
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
1858
31.8k
    if (!node->outgoings)
1859
6.75k
      continue;
1860
53.0k
    
for (i = 0; 25.0k
i < node->outgoings->rnum;
i++28.0k
)
1861
28.0k
    {
1862
28.0k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
1863
28.0k
      const int32_t one = 1;
1864
28.0k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
1865
28.0k
      /* If not found, set. If the current node is the destination node, there is no need
1866
28.0k
       * to set itself as a parent of subsequent nodes because of its terminal nature. */
1867
28.0k
      if (!term && (!cell.i32 || cell.i32[0] == 0))
1868
27.4k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
1869
264k
      for (j = 0; j < buf_size; 
j++236k
) /* set with all idx's dependencies as well */
1870
236k
      {
1871
236k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
1872
236k
        /* If not found, set */
1873
236k
        if (!cell.i32 || cell.i32[0] == 0)
1874
204k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
1875
31.9k
        else {
1876
31.9k
          /* Otherwise, set to the longest one */
1877
31.9k
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
1878
31.9k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
1879
31.9k
        }
1880
236k
      }
1881
28.0k
    }
1882
25.0k
  } ccv_nnc_graph_visit_endfor
1883
6.10k
#undef for_block
1884
6.10k
  ccfree(buf);
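The loop above builds exec_dep as a matrix of hop distances: for every visited node idx, each outgoing node records idx as a parent one hop away and inherits every transitive parent of idx at that parent's distance plus one, keeping the longest distance when two paths disagree. A hedged, standalone sketch of the same propagation on a small dense matrix (illustrative only; the real code walks ccv_sparse_matrix_t cells):

#include <string.h>

#define TOY_N 4 /* illustrative node count */

/* dep[child][ancestor] holds the longest hop count from ancestor to child, 0 = no path. */
static void toy_exec_dep(const int adj[TOY_N][TOY_N], int dep[TOY_N][TOY_N])
{
  memset(dep, 0, sizeof(int) * TOY_N * TOY_N);
  for (int idx = 0; idx < TOY_N; idx++) /* assumes idx enumerates nodes in topological order */
    for (int outgoing = 0; outgoing < TOY_N; outgoing++)
      if (adj[idx][outgoing])
      {
        if (dep[outgoing][idx] < 1)
          dep[outgoing][idx] = 1; /* direct edge: one hop */
        for (int j = 0; j < TOY_N; j++) /* inherit idx's ancestors, one hop further away */
          if (dep[idx][j] && dep[outgoing][j] < dep[idx][j] + 1)
            dep[outgoing][j] = dep[idx][j] + 1;
      }
}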
1885
6.10k
  // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
1886
6.10k
  const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
1887
6.10k
  ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
1888
6.10k
  // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
1889
6.10k
  // happens that I have to loop through all relevant nodes to find out whether one is used or not.
1890
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++93.9k
)
1891
93.9k
    tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
1892
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
1893
121k
    for (i = 0; i < node->input_size; 
i++89.8k
)
1894
89.8k
      if (node->inputs[i] >= 0)
1895
65.6k
      {
1896
65.6k
        tensor_blocks[node->inputs[i]].flags = 0;
1897
65.6k
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1898
65.6k
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1899
65.6k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
1900
65.6k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
1901
12
          tensor_blocks[node->inputs[i]].pin_mem = 1;
1902
65.6k
      }
1903
81.6k
    for (i = 0; i < node->output_size; 
i++49.8k
)
1904
49.8k
      if (node->outputs[i] >= 0)
1905
42.9k
      {
1906
42.9k
        tensor_blocks[node->outputs[i]].flags = 0;
1907
42.9k
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1908
42.9k
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1909
42.9k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
1910
42.9k
          (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
1911
10
          tensor_blocks[node->outputs[i]].pin_mem = 1;
1912
42.9k
      }
1913
31.8k
  } ccv_nnc_graph_visit_endfor
1914
6.10k
  if (p_node_info)
1915
62
  {
1916
62
    assert(p_tensor_symbol_info);
1917
62
    // Mark it as used if it is used in either input or output.
1918
165
    
for (i = 0; 62
i < p_node_info->input_size;
i++103
)
1919
103
      if (p_node_info->inputs[i] >= 0)
1920
103
      {
1921
103
        const int d = p_node_info->inputs[i];
1922
103
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
1923
92
        {
1924
92
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
1925
92
          if (dd >= 0) // If this exists in this sub-graph, great.
1926
80
            tensor_blocks[dd].flags = 0;
1927
92
        }
1928
103
      }
1929
132
    for (i = 0; i < p_node_info->output_size; 
i++70
)
1930
70
      if (p_node_info->outputs[i] >= 0)
1931
70
      {
1932
70
        const int d = p_node_info->outputs[i];
1933
70
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
1934
70
        {
1935
70
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
1936
70
          if (dd >= 0) // If this exists in this sub-graph, great.
1937
70
            tensor_blocks[dd].flags = 0;
1938
70
        }
1939
70
      }
1940
62
  }
1941
100k
  
for (i = 0; 6.10k
i < symbolic_graph->tensor_symbol_info->rnum;
i++93.9k
)
1942
93.9k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
1943
93.9k
    {
1944
73.0k
      // Check no tensor info is auto now.
1945
73.0k
      assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
1946
73.0k
      // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
1947
73.0k
      // therefore, its life-cycle almost certainly won't concatenate properly with the tensor to
1948
73.0k
      // fold to).
1949
73.0k
      if (tensor_symbol_info[i].assign_ref)
1950
40
      {
1951
40
        // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
1952
40
        // It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
1953
40
        // it kept its own representation, which is not the case for output).
1954
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
1955
40
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1956
40
        // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
1957
40
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref]);
1958
40
        // It also cannot be folded as output (except i), because we need to keep its own representation.
1959
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref]);
1960
40
        assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0);
1961
40
        tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
1962
63
        for (j = 0; j < unroll_count; 
j++23
)
1963
23
        {
1964
23
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
1965
23
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
1966
23
        }
1967
40
        if (tensor_blocks[assign_ref].bypass_ref)
1968
4
        {
1969
4
          // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
1970
4
          tensor_blocks[assign_ref].unfoldable_except_ref = 0;
1971
4
          const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
1972
4
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref]);
1973
4
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref]);
1974
4
          // On the other hand, it can be folded into the except_ref for the bypass_ref.
1975
4
          tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
1976
4
          if (dup_tensor_from_ref)
1977
2
          {
1978
2
            const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
1979
2
            if (bypass_from_ref >= 0)
1980
2
            {
1981
2
              TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref]);
1982
2
              TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref]);
1983
2
              assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref);
1984
2
              for (j = 0; j < unroll_count - 1; 
j++0
)
1985
0
              {
1986
0
                // Mark every incarnation as unfold-able.
1987
0
                TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
1988
0
                TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
1989
0
              }
1990
2
            }
1991
2
          }
1992
4
        }
1993
40
      }
1994
73.0k
    }
1995
100k
  
for (i = 0; 6.10k
i < symbolic_graph->tensor_symbol_info->rnum;
i++93.9k
)
1996
93.9k
  {
1997
93.9k
    // If it has a pair reference, we don't need to allocate this tensor at all,
1998
93.9k
    // set it to be unassigned.
1999
93.9k
    if (tensor_symbol_info[i].pair_ref)
2000
93.9k
      TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i]);
2001
93.9k
    // If it is a tape variable, set it to be un-foldable as too (otherwise we cannot use tape properly).
2002
93.9k
    else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2003
7
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2004
7
      TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
2005
7
      // For this case, there is no exception.
2006
7
      tensor_blocks[i].unfoldable_except_ref = 0;
2007
93.9k
    } else if (tensor_symbol_info[i].p_ref) {
2008
119
      assert(p_node_info);
2009
119
      const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2010
119
      // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2011
119
      if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2012
48
        // TODO: This check can be lifted if we can fold in the parent graph.
2013
48
        if (-1 == p_ref_is_in_or_out)
2014
48
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2015
119
      if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2016
119
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2017
119
    }
2018
93.9k
  }
2019
100k
  
for (i = 0; 6.10k
i < symbolic_graph->tensor_symbol_info->rnum;
i++93.9k
)
2020
93.9k
  {
2021
93.9k
    if (tensor_symbol_info[i].alias_ref)
2022
3.48k
    {
2023
3.48k
      const int ref = tensor_symbol_info[i].alias_ref - 1;
2024
3.48k
      // If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2025
3.48k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
2026
3.48k
        tensor_blocks[ref].flags = 0;
2027
3.48k
      // An alias cannot ref to another alias.
2028
3.48k
      assert(!tensor_symbol_info[ref].alias_ref);
2029
3.48k
      tensor_blocks[i].flags = ALIAS;
2030
3.48k
      tensor_blocks[i].ref = ref + 1; // Assign the ref.
2031
3.48k
      if (!tensor_blocks[ref].r_refs)
2032
3.44k
        tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2033
3.48k
      ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2034
3.48k
    }
2035
93.9k
  }
2036
6.10k
  // Scan again and if the ref is not assigned, mark the alias not assigned.
2037
100k
  
for (i = 0; 6.10k
i < symbolic_graph->tensor_symbol_info->rnum;
i++93.9k
)
2038
93.9k
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
2039
93.9k
    {
2040
3.48k
      const int ref = tensor_blocks[i].ref - 1;
2041
3.48k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]))
2042
3.48k
      {
2043
667
        // Mark this as unassigned.
2044
667
        tensor_blocks[i].flags = UNASSIGNED;
2045
667
        tensor_blocks[i].ref = 0;
2046
667
      }
2047
3.48k
    }
2048
100k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++93.9k
)
2049
93.9k
  {
2050
93.9k
    // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2051
93.9k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
2052
93.9k
    {
2053
71.2k
      tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2054
71.2k
      tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2055
71.2k
      // Cache tensor size (align to 16 bytes).
2056
71.2k
      tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2057
71.2k
    }
2058
93.9k
    // If there is a p_ref, add the one to the p_refs list.
2059
93.9k
    if (tensor_symbol_info[i].p_ref)
2060
128
      tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2061
93.9k
  }
2062
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2063
121k
    for (i = 0; i < node->input_size; 
i++89.8k
)
2064
89.8k
    {
2065
89.8k
      int d = node->inputs[i];
2066
89.8k
      if (d < 0)
2067
24.1k
        continue;
2068
65.6k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2069
65.6k
        d = tensor_symbol_info[d].alias_ref - 1;
2070
65.6k
      tensor_blocks[d].flags |= READ_ONLY;
2071
65.6k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2072
65.6k
        continue;
2073
65.6k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2074
65.6k
      /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph,
2075
65.6k
       * so it lives from the very beginning of the graph life-cycle and ends here). */
2076
65.6k
      if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags))
2077
65.6k
      {
2078
88.9k
        for (j = 0; j < source_size; 
j++60.7k
)
2079
60.7k
        {
2080
60.7k
          // If the source is connecting to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2081
60.7k
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2082
60.7k
          if (cell.i32 && cell.i32[0] > 0)
2083
23.6k
            _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2084
60.7k
        }
2085
28.2k
        /* If this is a read-only (based on SSA, if first encountered as read), and this is
2086
28.2k
         * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2087
28.2k
         * loop, however, in that case, you need to prevent read-only gets reused for the
2088
28.2k
         * output tensor, which is not obvious how to implement correctly), and it is not
2089
28.2k
         * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2090
28.2k
         * of memory anyway (because on second loop, we want to read the same value out).
2091
28.2k
         * Mark it to the end of the graph. */
2092
28.2k
        if (p_node_info && !tensor_symbol_info[d].assign_ref)
2093
210
          
for (j = 0; 105
j < destination_size;
j++105
)
2094
105
          {
2095
105
            // If the destination is connecting to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2096
105
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2097
105
            if (cell.i32 && cell.i32[0] > 0)
2098
65
              _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2099
105
          }
2100
28.2k
      }
2101
65.6k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2102
65.6k
    }
2103
81.6k
    
for (i = 0; 31.8k
i < node->output_size;
i++49.8k
)
2104
49.8k
    {
2105
49.8k
      int d = node->outputs[i];
2106
49.8k
      if (d < 0)
2107
6.85k
        continue;
2108
42.9k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2109
42.9k
        d = tensor_symbol_info[d].alias_ref - 1;
2110
42.9k
      tensor_blocks[d].flags |= WRITE_ONLY;
2111
42.9k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2112
42.9k
        continue;
2113
42.9k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2114
42.9k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2115
42.9k
    }
2116
31.8k
  } ccv_nnc_graph_visit_endfor
2117
6.10k
  // For any assign_ref, its life-time is kept until the end and wraps over.
2118
100k
  
for (i = 0; 6.10k
i < symbolic_graph->tensor_symbol_info->rnum;
i++93.9k
)
2119
93.9k
    // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2120
93.9k
    // that "somewhere else" need to keep its life-time til the end.
2121
93.9k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) &&
2122
93.9k
      p_node_info && tensor_symbol_info[i].assign_ref)
2123
42
    {
2124
42
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2125
84
      for (j = 0; j < destination_size; 
j++42
)
2126
42
      {
2127
42
        // This logic is to be more conservative about which destination we add to.
2128
42
        // As of now, if we add everything, it is fine most likely. However, it may
2129
42
        // cause issues in the future to do so naively. Thus, instead, we only add
2130
42
        // the destination to it iff either the tensor is not used at all, or the
2131
42
        // destination is on the same stream as the tensor block in some way.
2132
42
        int flag = !tensor_blocks[assign_ref].tail;
2133
83
        for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2134
41
        {
2135
41
          const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k);
2136
41
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2137
41
          flag = (cell.i32 && cell.i32[0] > 0);
2138
41
        }
2139
42
        if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2140
10
          _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2141
42
      }
2142
42
    }
2143
6.17k
  for (i = 0; i < output_size; 
i++68
)
2144
68
  {
2145
68
    assert(outputs[i].graph == symbolic_graph);
2146
68
    int d = outputs[i].d;
2147
68
    if (d < 0)
2148
0
      continue;
2149
68
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2150
68
      d = tensor_symbol_info[d].alias_ref - 1;
2151
68
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2152
68
      continue;
2153
68
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2154
282
    
for (j = 0; 68
j < destination_size;
j++214
)
2155
214
    {
2156
214
      int flag = !tensor_blocks[d].tail;
2157
428
      for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2158
214
      {
2159
214
        const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
2160
214
        const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2161
214
        flag = (cell.i32 && cell.i32[0] > 0);
2162
214
      }
2163
214
      if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2164
16
        _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2165
214
    }
2166
68
  }
2167
6.10k
  // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2168
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2169
31.8k
    int x, y;
2170
121k
    for (x = 0; x < node->input_size; 
x++89.8k
)
2171
257k
      
for (y = 0; 89.8k
y < node->output_size;
y++167k
)
2172
167k
        /* Some operations enforces some tensors to be the same for inputs / outputs. */
2173
167k
        if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2174
180
        {
2175
180
          // If both unassigned, it is fine.
2176
180
          if (node->inputs[x] < 0 && node->outputs[y] < 0)
2177
0
            continue;
2178
180
          int ref = node->inputs[x];
2179
180
          assert(ref >= 0);
2180
180
          while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
2181
0
            ref = tensor_blocks[ref].ref - 1;
2182
180
          const int node_output_y = node->outputs[y];
2183
180
          assert(node_output_y >= 0);
2184
180
          // If both are not computable, it is fine, we don't need to enforce.
2185
180
          if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2186
180
            !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y]))
2187
180
            continue;
2188
180
          // Otherwise, enforce and error out if failed.
2189
180
          if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2190
0
            { assert(0 && "cannot enforce inplace for the two tensors"); }
2191
180
        }
2192
31.8k
  } ccv_nnc_graph_visit_endfor
2193
6.10k
  // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2194
6.10k
  // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2195
6.10k
  // that is not enforced in-place (because the tensor enforced in-place will be different from the
2196
6.10k
  // binding one).
2197
53.5k
  for (i = 0; i < tensor_bind_size; i++)
2198
47.4k
  {
2199
47.4k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2200
47.4k
    // If there is a tensor bound, then it is unassigned.
2201
47.4k
    if (resolved_symbol.d >= 0)
2202
47.4k
    {
2203
47.4k
      int d = resolved_symbol.d;
2204
47.4k
      // I cannot assert too much at this moment.
2205
47.4k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2206
47.4k
        d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2207
47.4k
      // This check is for in-place ops. Only in-place op could have unassigned but ref.
2208
47.4k
      // It has nothing to do with alias.
2209
47.6k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
2210
146
        d = tensor_blocks[d].ref - 1;
2211
47.4k
      // Doesn't work if this is a loop carrying variable.
2212
47.4k
      assert(!tensor_symbol_info[d].assign_ref);
2213
47.4k
      tensor_blocks[d].flags = UNASSIGNED;
2214
47.4k
      tensor_blocks[d].ref = 0; // No need to have ref as well.
2215
47.4k
    }
2216
47.4k
  }
2217
6.10k
  // Maximize tensor reuse by collapsing tensors where in-place operations are allowed (and the start / end tensors match).
2218
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2219
31.8k
    int x, y;
2220
121k
    for (x = 0; x < node->input_size; 
x++89.8k
)
2221
89.8k
    {
2222
89.8k
      /* If the input is not assigned, it can be referenced, find the referenced one */
2223
89.8k
      int ref = node->inputs[x];
2224
89.8k
      if (ref < 0)
2225
24.1k
        continue;
2226
72.8k
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && tensor_blocks[ref].ref)
2227
7.15k
        ref = tensor_blocks[ref].ref - 1;
2228
65.6k
      assert(tensor_blocks[ref].ref == 0);
2229
65.6k
      const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2230
65.6k
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2231
65.6k
        tensor_blocks[ref].tail->rnum == 1)
2232
84.4k
        
for (y = 0; 32.2k
y < node->output_size;
y++52.1k
)
2233
52.1k
          /* Only proceed if the input symbol is different from the output symbol, */
2234
52.1k
          /* and the input symbol meets the output symbol exactly at the same spot. */
2235
52.1k
          if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2236
52.1k
            node->outputs[y] >= 0 &&
2237
52.1k
            ref != node->outputs[y] &&
2238
52.1k
            TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]]))
2239
52.1k
          {
2240
6.30k
            const int node_output_y = node->outputs[y];
2241
6.30k
            const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2242
6.30k
            /* If dimension matches perfectly, then we can assign y_symbol to x. */
2243
6.30k
            if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
2244
6.23k
              _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2245
6.30k
          }
2246
65.6k
    }
2247
31.8k
  } ccv_nnc_graph_visit_endfor
2248
6.10k
  // Specifically handle the bypass. This needs to be done after the first pass.
2249
6.10k
  // I need to extend the bypass life-time to the same as the one I am going with.
2250
6.10k
  // It is important that we visit these nodes and assign bypass_ref to their dependents in topological order.
2251
6.10k
  ccv_nnc_tensor_block_t empty_block = {};
2252
6.10k
  empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2253
6.10k
  empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2254
31.8k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2255
31.8k
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2256
13
    {
2257
13
      int can_bypass = 1;
2258
28
      for (i = 0; can_bypass && i < node->output_size; i++)
2259
15
      {
2260
15
        int d = node->outputs[i];
2261
15
        if (d < 0)
2262
0
          continue;
2263
15
        if (!tensor_blocks[d].bypass_ref)
2264
2
          continue;
2265
13
        while (tensor_blocks[d].ref)
2266
0
          d = tensor_blocks[d].ref - 1;
2267
13
        int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2268
14
        while (tensor_blocks[bypass_ref].ref)
2269
1
          bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2270
13
        // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2271
13
        if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2272
10
          continue;
2273
3
        ccv_array_clear(empty_block.head);
2274
6
        for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; 
j++3
)
2275
3
          ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j));
2276
3
        ccv_array_clear(empty_block.tail);
2277
6
        for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; 
j++3
)
2278
3
          ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j));
2279
6
        for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++3
)
2280
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), empty_block);
2281
6
        for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++3
)
2282
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), empty_block);
2283
3
        // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2284
3
        assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref));
2285
3
        int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2286
3
        while (tensor_blocks[b_ref].ref)
2287
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2288
3
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2289
3
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2290
3
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2291
3
        // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2292
3
        can_bypass = can_bypass && (a_hop_b || b_hop_a);
2293
3
      }
2294
13
      if (can_bypass)
2295
10
      {
2296
22
        for (i = 0; i < node->output_size; 
i++12
)
2297
12
        {
2298
12
          int d = node->outputs[i];
2299
12
          if (d < 0)
2300
0
            continue;
2301
12
          if (!tensor_blocks[d].bypass_ref)
2302
2
            continue;
2303
10
          while (tensor_blocks[d].ref)
2304
0
            d = tensor_blocks[d].ref - 1;
2305
10
          int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2306
10
          while (tensor_blocks[bypass_ref].ref)
2307
0
            bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2308
10
          // The bypass_ref can extend its life-time.
2309
20
          for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++10
)
2310
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), tensor_blocks[bypass_ref]);
2311
20
          for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++10
)
2312
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), tensor_blocks[bypass_ref]);
2313
10
        }
2314
10
      } else {
2315
6
        for (i = 0; i < node->output_size; 
i++3
)
2316
3
          tensor_blocks[node->outputs[i]].bypass_ref = 0;
2317
3
        const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2318
3
        // Mark this exec as no bypass IO (thus, I need to insert an explicit data transfer).
2319
3
        exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2320
3
      }
2321
13
    }
2322
31.8k
  } ccv_nnc_graph_visit_endfor
2323
6.10k
  ccv_array_free(empty_block.head);
2324
6.10k
  ccv_array_free(empty_block.tail);
2325
6.10k
  *r_exec_dep = exec_dep;
2326
6.10k
  *r_tensor_blocks = tensor_blocks;
2327
6.10k
}
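The head / tail arrays filled in by this prep pass feed every later interference check: two blocks can share memory only when one block's heads all come strictly after the other block's tails in exec_dep (the _ccv_nnc_tensor_block_head_after_tail() calls further down). A toy version of that test, assuming the dense hop matrix from the earlier sketch flattened row-major (illustrative only):

/* Illustrative only: does every exec in `head` come after every exec in `tail`,
 * given dep[child * n + ancestor] = longest hop count (0 = no path)? */
static int toy_head_after_tail(const int* const dep, const int n, const int* const head, const int head_size, const int* const tail, const int tail_size)
{
  for (int i = 0; i < head_size; i++)
    for (int j = 0; j < tail_size; j++)
      if (!dep[head[i] * n + tail[j]]) /* no path tail[j] -> head[i]: life-times may overlap */
        return 0;
  return 1; /* strictly ordered: the two blocks could reuse the same memory */
}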
2328
2329
static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2330
33
{
2331
33
  if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2332
3
  {
2333
3
    ccv_nnc_cmd_t retval = cmd;
2334
3
    retval.cmd = CCV_NNC_NOOP;
2335
3
    return retval;
2336
3
  }
2337
30
  return cmd;
2338
30
}
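This substitution callback only matters in combination with ccv_nnc_symbolic_graph_dup(), which the unroll path below uses to copy the graph while swapping any nested GRAPH_FORWARD / GRAPH_BACKWARD command for a NOOP. A hedged usage sketch mirroring that call (assumes a symbolic `graph` built elsewhere; the ccv_nnc_symbolic_graph_free() cleanup is the usual pairing but does not appear in this excerpt):

/* Sketch only: duplicate `graph` with sub-graphs replaced by NOOP, as the unroll path below does. */
ccv_nnc_symbolic_graph_t* const dup_graph = ccv_nnc_symbolic_graph_dup(graph, _ccv_nnc_subst_sub_graph_with_noop);
/* ... analyze or unroll the duplicate here ... */
ccv_nnc_symbolic_graph_free(dup_graph);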
2339
2340
static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2341
102
{
2342
102
  if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2343
47
  {
2344
47
    if (tensor_symbol_info[input].alias_ref)
2345
18
    {
2346
18
      const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2347
18
      assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2348
18
      ccv_nnc_tensor_symbol_t tensor_symbol = {};
2349
18
      if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2350
6
      {
2351
6
        tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2352
6
        if (tensor_symbol_info[alias_ref].pair_ref)
2353
0
          ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2354
0
            .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2355
0
            .graph = dup_graph->pair
2356
0
          });
2357
6
        ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2358
6
        dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2359
12
      } else {
2360
12
        tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2361
12
        tensor_symbol.graph = dup_graph;
2362
12
      }
2363
18
      ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].inc, tensor_symbol_info[input].info, 0);
2364
18
      if (tensor_symbol_info[input].pair_ref)
2365
0
        ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2366
0
          .d = tensor_symbol_info[input].pair_ref - 1,
2367
0
          .graph = dup_graph->pair
2368
0
        });
2369
18
      ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2370
18
      dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2371
29
    } else {
2372
29
      ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2373
29
      if (tensor_symbol_info[input].pair_ref)
2374
4
        ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2375
4
          .d = tensor_symbol_info[input].pair_ref - 1,
2376
4
          .graph = dup_graph->pair
2377
4
        });
2378
29
      ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2379
29
      dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2380
29
    }
2381
47
    if (tensor_symbol_info[input].bypass_ref)
2382
2
    {
2383
2
      const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2384
2
      assert(dup_bypass_ref >= 0);
2385
2
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2386
2
      symbol_info->bypass_ref = dup_bypass_ref + 1;
2387
2
    }
2388
47
  }
2389
102
  return (ccv_nnc_tensor_symbol_t) {
2390
102
    .d = dup_tensor_block_ref[input * unroll_count],
2391
102
    .graph = dup_graph,
2392
102
  };
2393
102
}
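The dup helper above, and the exec-symbol variant that follows it, key their caches by the same row-major layout: entry [symbol * unroll_count + n] holds the duplicated index for the n-th unrolled copy, with -1 meaning that copy has not been created yet. A small illustrative allocator for such a table (assumption: plain malloc stands in for ccmalloc here):

#include <stdlib.h>

/* Illustrative only: allocate and reset a symbol-by-copy lookup table,
 * where table[symbol * unroll_count + n] == -1 means "no duplicate yet". */
static int* toy_dup_ref_new(const int symbol_count, const int unroll_count)
{
  int* const table = (int*)malloc(sizeof(int) * symbol_count * unroll_count);
  for (int i = 0; i < symbol_count * unroll_count; i++)
    table[i] = -1;
  return table;
}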
2394
2395
static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2396
72
{
2397
72
  int i;
2398
72
  if (dup_exec_ref[idx * unroll_count] < 0)
2399
44
  {
2400
44
    // Input has to come before output, because the output could have a bypass reference to the input.
2401
116
    for (i = 0; i < node->input_size; 
i++72
)
2402
72
      max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2403
75
    for (i = 0; i < node->output_size; 
i++31
)
2404
31
      max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2405
44
    ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2406
44
    dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2407
44
  }
2408
72
  return (ccv_nnc_graph_exec_symbol_t) {
2409
72
    .d = dup_exec_ref[idx * unroll_count],
2410
72
    .graph = dup_graph,
2411
72
  };
2412
72
}
2413
2414
static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2415
6.10k
{
2416
6.10k
  int i;
2417
100k
  for (i = 0; i < tensor_block_size; 
i++94.0k
)
2418
94.0k
  {
2419
94.0k
    if (tensor_blocks[i].head)
2420
65.1k
      ccv_array_free(tensor_blocks[i].head);
2421
94.0k
    if (tensor_blocks[i].tail)
2422
65.1k
      ccv_array_free(tensor_blocks[i].tail);
2423
94.0k
    if (tensor_blocks[i].r_refs)
2424
9.54k
      ccv_array_free(tensor_blocks[i].r_refs);
2425
94.0k
    if (tensor_blocks[i].dup_p_refs)
2426
22
      ccv_array_free(tensor_blocks[i].dup_p_refs);
2427
94.0k
  }
2428
6.10k
  ccfree(tensor_blocks);
2429
6.10k
}
2430
2431
// Find tensors that cannot be solved by co-allocating to the same location.
2432
static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2433
21
{
2434
21
  int i, j, unroll_count = 0;
2435
131
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++110
)
2436
110
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && tensor_symbol_info[i].assign_ref)
2437
25
    {
2438
25
      // This is a parameter, thus, it has to be either an alias or used.
2439
25
      assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2440
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2441
25
      // The parameter it assign to has to be either an alias or used.
2442
25
      assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2443
25
      // If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2444
25
      // If it is the same, we are good, no need to extend.
2445
25
      int a_ref = i;
2446
25
      while (tensor_blocks[a_ref].ref)
2447
0
        a_ref = tensor_blocks[a_ref].ref - 1;
2448
25
      int b_ref = assign_ref;
2449
31
      while (tensor_blocks[b_ref].ref)
2450
6
        b_ref = tensor_blocks[b_ref].ref - 1;
2451
25
      if (a_ref != b_ref)
2452
19
      {
2453
19
        // If any of the b's head is deterministically later than a's tail
2454
19
        // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2455
19
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2456
19
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2457
19
        // It cannot be that both i can hop to j and j can hop to i.
2458
19
        assert(!(a_hop_b > 0 && b_hop_a > 0));
2459
19
        // Can it be folded
2460
19
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2461
19
        if (a_hop_b || b_hop_a)
2462
3
        {
2463
3
          tensor_blocks[a_ref].companion_ref = b_ref + 1;
2464
3
          tensor_blocks[b_ref].companion_ref = a_ref + 1;
2465
3
          continue;
2466
3
        }
2467
16
        int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2468
20
        for (j = 0; c_ref >= 0; 
j++4
)
2469
4
        {
2470
4
          while (tensor_blocks[c_ref].ref)
2471
0
            c_ref = tensor_blocks[c_ref].ref - 1;
2472
4
          c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2473
4
        }
2474
16
        unroll_count = ccv_max(unroll_count, j + 1);
2475
16
      }
2476
25
    }
2477
21
  // Reset companion_ref if need to unroll.
2478
21
  if (unroll_count)
2479
91
    
for (j = 0; 13
j < symbolic_graph->tensor_symbol_info->rnum;
j++78
)
2480
78
      tensor_blocks[j].companion_ref = 0;
2481
21
  return unroll_count;
2482
21
}
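The unroll count above comes from walking chains of assign_ref links: when a loop-carried parameter and its source cannot share memory (no hop in either direction), the graph has to be unrolled once per further link in the chain, plus one. A hedged standalone sketch of just that chain walk (assign_ref is 1-based with 0 meaning none, as in the tensor symbol info):

/* Illustrative only: length of the assign_ref chain hanging off `start`, plus one,
 * which matches the j + 1 fed into ccv_max() above. */
static int toy_chain_unroll_count(const int* const assign_ref, const int start)
{
  int j = 0;
  for (int c = assign_ref[start] - 1; c >= 0; j++)
    c = assign_ref[c] - 1; /* follow the loop-carried chain until it terminates */
  return j + 1;
}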
2483
2484
static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2485
13
{
2486
13
  int i, j, n;
2487
13
  // The inout exec nodes, these are the nodes we are going to extend.
2488
13
  uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2489
13
  int max_input_size = 0;
2490
13
  int max_output_size = 0;
2491
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++35
)
2492
35
  {
2493
35
    max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
2494
35
    max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
2495
35
  }
2496
13
  ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
2497
13
  ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
2498
13
  // Doing graph expansion
2499
13
  // It goes without saying that we must have more than one tensor / exec (otherwise I cannot use 0 as no exec ref).
2500
13
  assert(dup_graph->exec_symbol_info->rnum > 0);
2501
13
  assert(dup_graph->tensor_symbol_info->rnum > 0);
2502
88
#define INCOMING_NODE (1)
2503
28
#define OUTGOING_NODE (2)
2504
13
  // Unroll the graph n times.
2505
29
  
for (n = 0; 13
n < unroll_count;
n++16
)
2506
16
  {
2507
16
    int* const dup_exec_ref = r_dup_exec_ref + n;
2508
16
    const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2509
16
    int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2510
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2511
46
      dup_exec_ref[i * unroll_count] = -1;
2512
131
    for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++115
)
2513
115
    {
2514
115
      // If there is an assign_ref, that means I don't need to dup the tensor.
2515
115
      if (tensor_symbol_info[i].assign_ref)
2516
25
      {
2517
25
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2518
25
        dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2519
90
      } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_READ_WRITE(tensor_blocks[i]) == READ_ONLY)
2520
26
      // If this is a read-only tensor block, no need to duplicate because the value never changes
2521
26
      // (note we handled assign_ref first), therefore, no need to generate duplicate.
2522
26
        dup_tensor_block_ref[i * unroll_count] = i;
2523
64
      else
2524
64
        dup_tensor_block_ref[i * unroll_count] = -1;
2525
115
    }
2526
16
    // Go through the original graph, make copies of the node if it is inout.
2527
44
    ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2528
44
      ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2529
44
      inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2530
44
      if (!node->outgoings)
2531
16
        continue;
2532
56
      
for (i = 0; 28
i < node->outgoings->rnum;
i++28
)
2533
28
      {
2534
28
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
2535
28
        inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2536
28
        ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2537
28
        ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2538
28
      }
2539
28
    } ccv_nnc_graph_visit_endfor
2540
16
    // Check the visitor are all marked as either incoming or outgoing.
2541
16
    const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2542
16
    const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2543
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2544
46
    {
2545
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2546
46
        continue;
2547
44
      assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
2548
44
      // If this is a pure incoming node, then I need to concat this one with all original destination nodes
2549
44
      if (inout[i] == INCOMING_NODE)
2550
44
        
for (j = 0; 16
j < dup_destination_size32
;
j++16
)
2551
16
        {
2552
16
          ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2553
16
            .d = dup_destinations[j].d,
2554
16
            .graph = dup_graph,
2555
16
          }, (ccv_nnc_graph_exec_symbol_t) {
2556
16
            .d = dup_exec_ref[i * unroll_count],
2557
16
            .graph = dup_graph,
2558
16
          });
2559
16
        }
2560
44
    }
2561
16
    if (dup_graph->destinations)
2562
16
      ccv_array_clear(dup_graph->destinations);
2563
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2564
46
    {
2565
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2566
46
        continue;
2567
44
      const int d = dup_exec_ref[i * unroll_count];
2568
44
      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
2569
44
      // If this has no outgoing node, add to the destination.
2570
44
      if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2571
16
        ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2572
16
          .graph = dup_graph,
2573
16
          .d = d,
2574
16
        });
2575
44
    }
2576
16
  }
2577
13
#undef INCOMING_NODE
2578
13
#undef OUTGOING_NODE
2579
13
  ccfree(inout);
2580
13
}
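During each unroll pass above, a node is tagged INCOMING_NODE when its copy is created and OUTGOING_NODE when it is reached through someone's outgoing edge; a node that ends up tagged only as incoming has no predecessor inside the copy and therefore must be concatenated onto the previous copy's destinations. A minimal sketch of that classification (illustrative only, using stand-in flag values rather than the local macros above):

/* Illustrative only: the two-bit bookkeeping used while stitching unrolled copies together. */
enum { TOY_INCOMING = 1, TOY_OUTGOING = 2 };

static int toy_needs_concat_to_prev_copy(const unsigned char inout_flag)
{
  /* tagged as incoming but never reached as an outgoing edge: it starts this copy,
   * so it has to be wired after the previous copy's destination nodes. */
  return inout_flag == TOY_INCOMING;
}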
2581
2582
static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2583
13
{
2584
13
  int i;
2585
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2586
78
    // Now can assign them (The dup) as companion.
2587
78
    // Get to the last one, which we will wrap over.
2588
78
    if (dup_tensor_symbol_info[i].assign_ref)
2589
17
    {
2590
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2591
17
      dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2592
17
      assert(dup_tensor_symbol_info[i].assign_ref);
2593
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2594
17
    }
2595
13
}
2596
2597
// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2598
// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2599
// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2600
static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2601
21
{
2602
21
  int i, j, k;
2603
45
  for (i = 0; i < p_node_info->output_size; 
i++24
)
2604
24
  {
2605
24
    const int d = p_node_info->outputs[i];
2606
24
    const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2607
24
    if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2608
24
      continue;
2609
36
    
for (k = 0; 18
k < destination_size;
k++18
)
2610
18
      _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2611
18
    // Add the duplicated destinations to the tensor_block_ref.
2612
42
    for (j = 0; j < unroll_count; 
j++24
)
2613
48
      
for (k = 0; 24
k < destination_size;
k++24
)
2614
24
      {
2615
24
        const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2616
24
        const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2617
24
        if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2618
24
          _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2619
24
      }
2620
18
  }
2621
21
}
2622
2623
static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2624
21
{
2625
21
  int i, j;
2626
21
  ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2627
21
  ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2628
21
  // Find the tensor blocks that cannot be simply solved with either in-place tensor block folding or by using the same memory region.
2630
21
  // Unfortunately, I cannot apply this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2630
21
  // No need to change anything, we are good.
2631
21
  const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2632
21
  if (!unroll_count)
2633
8
    return;
2634
13
  // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2635
13
  // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2636
13
  ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2637
13
  int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2638
13
  int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2639
13
  _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2640
13
  ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2641
13
  ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2642
26
  ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0);
2643
26
  ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2644
26
  _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2645
26
  // Free out the old exec_dep
2646
26
  ccv_matrix_free(exec_dep);
2647
26
  // and the tensor blocks, prepare for the new.
2648
26
  _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2649
26
  // A reverse map to find where the original tensor comes from.
2650
26
  int* dup_tensor_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2651
142
  for (i = 0; i < dup_graph->tensor_symbol_info->rnum; 
i++129
)
2652
129
    dup_tensor_from_ref[i] = -1;
2653
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2654
193
    
for (j = 0; 78
j < unroll_count;
j++115
)
2655
115
      if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2656
104
        dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2657
26
  int* dup_exec_from_ref = (int*)ccmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2658
90
  for (i = 0; i < dup_graph->exec_symbol_info->rnum; 
i++77
)
2659
77
    dup_exec_from_ref[i] = -1;
2660
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++35
)
2661
35
  {
2662
35
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2663
35
      continue;
2664
33
    dup_exec_from_ref[i] = i; // Reference back.
2665
77
    for (j = 0; j < unroll_count; 
j++44
)
2666
44
      if (dup_exec_ref[i * unroll_count + j] >= 0)
2667
44
        dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2668
33
  }
2669
26
  // Reset all attr.
2670
26
  memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2671
26
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2672
26
  ccv_nnc_graph_visit_free(dup_visit);
2673
26
  ccfree(dup_exec_symbol_info);
2674
26
  ccfree(dup_exec_from_ref);
2675
26
  ccfree(dup_tensor_from_ref);
2676
26
  // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2677
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2678
78
    // Loop over all possible duplications to assign dup_p_ref properly.
2679
193
    
for (j = 0; 78
j < unroll_count;
j++115
)
2680
115
    {
2681
115
      const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2682
115
      if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2683
44
      {
2684
44
        const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2685
44
        const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2686
44
        if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2687
28
        {
2688
28
          if (!tensor_blocks[dup_idx].dup_p_refs)
2689
22
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2690
28
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2691
28
        }
2692
44
        if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2693
44
          continue;
2694
0
        const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2695
0
        const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2696
0
        if (p_ref_1_is_in_or_out == 1)
2697
0
        {
2698
0
          if (!tensor_blocks[dup_idx].dup_p_refs)
2699
0
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2700
0
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2701
0
        }
2702
0
      }
2703
115
    }
2704
26
  // companion_ref
2705
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2706
78
    // Now can assign them (The dup) as companion.
2707
78
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
dup_tensor_symbol_info[i].assign_ref71
)
2708
17
    {
2709
17
      // Get to the last one, which we will wrap over.
2710
17
      const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2711
17
      if (assign_ref >= 0)
2712
17
      {
2713
17
        int b_ref = assign_ref;
2714
17
        while (tensor_blocks[b_ref].ref)
2715
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2716
17
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2717
17
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2718
17
        // It cannot be that both i can hop to j and j can hop to i.
2719
17
        // And after duplication, it is now possible to hop from one to the other.
2720
17
        assert(a_hop_b > 0 || b_hop_a > 0);
2721
17
        tensor_blocks[i].companion_ref = b_ref + 1;
2722
17
        tensor_blocks[b_ref].companion_ref = i + 1;
2723
17
      }
2724
17
    }
2725
26
  
ccfree13
(dup_tensor_symbol_info);
2726
13
  // Extend the dup tensor block ref, prepare for future extensions.
2727
13
  dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2728
110
  for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; 
i++97
)
2729
97
    dup_tensor_block_ref[i] = -1;
2730
13
  // Assign out changed properties.
2731
13
  *r_exec_dep = exec_dep;
2732
13
  *r_tensor_blocks = tensor_blocks;
2733
13
  *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2734
13
  *r_dup_graph = dup_graph;
2735
13
  *r_unroll_count = unroll_count;
2736
13
  *r_dup_exec_ref = dup_exec_ref;
2737
13
  *r_dup_tensor_block_ref = dup_tensor_block_ref;
2738
13
}
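
The pairing above follows the assign_ref chain to its root block and then links the two blocks both ways, so they share one memory region across loop iterations (the one with the smaller index acts as the primary). Below is a minimal, self-contained sketch of that pattern; the toy_* types and helpers are illustrative stand-ins, not the library's structs, and references are 1-based with 0 meaning "none", matching the convention in this file.

#include <assert.h>
#include <stdio.h>

typedef struct {
	int ref;           /* 1-based "folded into" reference; 0 means this block is a root. */
	int companion_ref; /* 1-based companion reference; 0 means no companion. */
} toy_block_t;

/* Follow the ref chain to the root block, as the code above does for assign_ref. */
static int toy_unref(const toy_block_t* const blocks, int i)
{
	while (blocks[i].ref)
		i = blocks[i].ref - 1;
	return i;
}

/* Mutually pair block i with the root of the assign_ref chain. */
static void toy_pair_companion(toy_block_t* const blocks, const int i, const int assign_ref)
{
	const int b_ref = toy_unref(blocks, assign_ref);
	assert(b_ref != i);
	blocks[i].companion_ref = b_ref + 1;
	blocks[b_ref].companion_ref = i + 1;
}

int main(void)
{
	toy_block_t blocks[3] = { { 0, 0 }, { 0, 0 }, { 2, 0 } }; /* Block 2 was folded into block 1. */
	toy_pair_companion(blocks, 0, 2);
	printf("0 <-> %d\n", blocks[0].companion_ref - 1); /* Prints "0 <-> 1". */
	return 0;
}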
2739
2740
static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2741
31
{
2742
31
  if (!anonymous_block_free_list || 
!anonymous_block_free_list_cap15
)
2743
28
    return tensor_block_size;
2744
3
  int i;
2745
3
  const int no_dup_p_refs = (!dup_p_refs || 
!dup_p_refs->rnum0
);
2746
3
  int found_idx = tensor_block_size;
2747
9
  for (i = 0; i < anonymous_block_free_list_cap; 
i++6
)
2748
7
  {
2749
7
    const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2750
7
    assert(idx < tensor_block_size);
2751
7
    // If the type doesn't match, ignore.
2752
7
    if (tensor_blocks[idx].type != type)
2753
0
      continue;
2754
7
    // Heuristic about how to select the best tensor block to move forward.
2755
7
    // If the size is larger and there are no dup_p_refs, I cannot do better than this, just return directly.
2756
7
    if (tensor_blocks[idx].size >= size)
2757
1
    {
2758
1
      if (no_dup_p_refs)
2759
1
        return idx;
2760
0
      // Otherwise, we cannot do better than this only if the current tensor block's dup_p_refs is after (or at) the requested dup_p_refs;
2761
0
      // if that is the case, just return.
2762
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2763
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2764
0
        return idx;
2765
6
    }
2766
6
    int64_t found_idx_size_diff;
2767
6
    int64_t idx_size_diff;
2768
6
    if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2769
6
      // Now, compare whether this one or the found_idx one is better.
2770
6
      // At this point, there is no point in comparing the dup_p_refs; we only care about which one
2771
6
      // is closer to the size we request. Only on a tie does dup_p_refs matter again.
2772
6
      
(found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size))4
)
2773
3
    {
2774
3
      found_idx = idx;
2775
3
      continue;
2776
3
    }
2777
3
    // No need to update if found_idx is better than idx.
2778
3
    if (found_idx_size_diff > idx_size_diff)
2779
0
      continue;
2780
3
    // We bias towards the bigger one when the size differences are similar.
2781
3
    if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
2782
0
    {
2783
0
      found_idx = idx;
2784
0
      continue;
2785
0
    }
2786
3
    assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
2787
3
    // On a tie, check which one has tighter life-cycle.
2788
3
    if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
2789
0
    {
2790
0
      // Check whether the current tensor block's life-cycle is longer than the previous one.
2791
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
2792
0
        (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
2793
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2794
0
        found_idx = idx;
2795
0
      continue;
2796
0
    }
2797
3
    // Now both of our sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
2798
3
    // We prefer to choose the one whose life-cycle is closer to the expected one.
2799
3
    if (no_dup_p_refs)
2800
3
    {
2801
3
      // Whoever is shorter wins.
2802
3
      if (tensor_blocks[found_idx].dup_p_refs && 
tensor_blocks[found_idx].dup_p_refs->rnum > 00
&&
2803
3
        
(0
!tensor_blocks[idx].dup_p_refs0
||
!tensor_blocks[idx].dup_p_refs->rnum0
||
2804
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
2805
0
        found_idx = idx;
2806
3
      continue;
2807
3
    }
2808
0
    if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
2809
0
      continue;
2810
0
    if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
2811
0
    {
2812
0
      found_idx = idx;
2813
0
      continue;
2814
0
    }
2815
0
    // If both cover the requested dup_p_refs, we prefer the shorter one; otherwise we prefer the longer one.
2816
0
    const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
2817
0
    const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
2818
0
    if (idx_after_request && found_idx_after_request)
2819
0
    {
2820
0
      if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
2821
0
        found_idx = idx;
2822
0
      continue;
2823
0
    } else {
2824
0
      // We entered this branch because either idx_after_request is false or found_idx_after_request is false, or both.
2825
0
      // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
2826
0
      // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
2827
0
      if (!found_idx_after_request && (idx_after_request ||
2828
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2829
0
        found_idx = idx;
2830
0
      continue;
2831
0
    }
2832
0
  }
2833
3
  
return found_idx2
;
2834
3
}
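
As a rough illustration of the size heuristic in the function above (ignoring the dup_p_refs life-cycle tie-breaking that the real code applies), here is a hedged, self-contained sketch that scans a free list for a block of the matching type and returns the closest size match, preferring any block that already covers the request. The toy_* names are stand-ins, not the library's types.

#include <stdint.h>
#include <stdlib.h>

typedef struct {
	int type;
	uint64_t size;
} toy_free_block_t;

/* Return the index of the closest-size block of a matching type, or -1 if none matches. */
static int toy_best_fit(const toy_free_block_t* const blocks, const int* const free_list, const int free_count, const int type, const uint64_t size)
{
	int i, found = -1;
	long long best_diff = 0;
	for (i = 0; i < free_count; i++)
	{
		const int idx = free_list[i];
		if (blocks[idx].type != type)
			continue; /* Types must match exactly, as in the real heuristic. */
		if (blocks[idx].size >= size)
			return idx; /* Already covers the request; cannot do better on size alone. */
		const long long diff = llabs((long long)blocks[idx].size - (long long)size);
		if (found < 0 || diff < best_diff)
		{
			found = idx;
			best_diff = diff;
		}
	}
	return found;
}

int main(void)
{
	const toy_free_block_t blocks[3] = { { 0, 64 }, { 0, 256 }, { 1, 1024 } };
	const int free_list[3] = { 0, 1, 2 };
	return toy_best_fit(blocks, free_list, 3, 0, 200) == 1 ? 0 : 1; /* Block 1 covers 200 bytes. */
}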
2835
2836
static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
2837
49
{
2838
49
  if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
2839
28
    return 0;
2840
21
  int i, j, k;
2841
21
  int input_size = 0;
2842
43
  for (i = 0; i < p_node_info->p_while.input_size; 
i++22
)
2843
22
    if (p_node_info->p_while.inputs[i] >= 0)
2844
2
      ++input_size;
2845
21
  // If it doesn't have tensor inputs (thus, only special inputs), just return.
2846
21
  if (!input_size)
2847
19
    return 0;
2848
2
  ccv_nnc_tensor_symbol_t inputs[input_size];
2849
2
  input_size = 0;
2850
6
  for (i = 0; i < p_node_info->p_while.input_size; 
i++4
)
2851
4
    if (p_node_info->p_while.inputs[i] >= 0)
2852
2
      inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
2853
2
        .d = p_node_info->p_while.inputs[i],
2854
2
        .graph = symbolic_graph,
2855
2
      };
2856
2
  assert(symbolic_graph->breakpoint_size > 0);
2857
2
  ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
2858
2
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
2859
4
  for (i = 0; i < symbolic_graph->breakpoint_size; 
i++2
)
2860
2
  {
2861
2
    // Make a noop copy of the breakpoint, but with some tensor inputs.
2862
2
    ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
2863
2
    ccv_array_push(dup_breakpoints, &noop);
2864
2
    // Connect this noop to the outgoing nodes of breakpoints.
2865
2
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
2866
2
    if (symbol_info->outgoings)
2867
4
      
for (j = 0; 2
j < symbol_info->outgoings->rnum;
j++2
)
2868
2
      {
2869
2
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
2870
2
        ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
2871
2
          .d = d,
2872
2
          .graph = symbolic_graph,
2873
2
        });
2874
2
      }
2875
2
  }
2876
7
  for (i = 0; i < exec_symbol_info_size; 
i++5
)
2877
5
  {
2878
5
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
2879
5
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
2880
5
      
continue0
;
2881
5
    if (symbol_info->outgoings)
2882
3
    {
2883
3
      const int outgoing_size = symbol_info->outgoings->rnum;
2884
6
      for (j = 0; j < outgoing_size; 
j++3
)
2885
3
      {
2886
3
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
2887
6
        for (k = 0; k < symbolic_graph->breakpoint_size; 
k++3
)
2888
3
          if (d == symbolic_graph->breakpoints[k].d)
2889
0
          {
2890
0
            ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
2891
0
            ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
2892
0
              .d = i,
2893
0
              .graph = symbolic_graph,
2894
0
            }, noop);
2895
0
            // Found, connected, exit.
2896
0
            break;
2897
0
          }
2898
3
      }
2899
3
    }
2900
5
  }
2901
2
  // Add the dup_breakpoints to the sources if necessary.
2902
2
  assert(symbolic_graph->sources);
2903
2
  const int source_size = symbolic_graph->sources->rnum;
2904
4
  for (i = 0; i < source_size; 
i++2
)
2905
2
  {
2906
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
2907
2
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++0
)
2908
2
      if (d == symbolic_graph->breakpoints[j].d)
2909
2
      {
2910
2
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
2911
2
        ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
2912
2
        // Found, made, exit.
2913
2
        break;
2914
2
      }
2915
2
  }
2916
2
  // Add the dup_breakpoints to the destinations if necessary.
2917
2
  assert(symbolic_graph->destinations);
2918
2
  const int destination_size = symbolic_graph->destinations->rnum;
2919
4
  for (i = 0; i < destination_size; 
i++2
)
2920
2
  {
2921
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
2922
4
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++2
)
2923
2
      if (d == symbolic_graph->breakpoints[j].d)
2924
0
      {
2925
0
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
2926
0
        ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
2927
0
        // Found, made, exit.
2928
0
        break;
2929
0
      }
2930
2
  }
2931
2
  return dup_breakpoints;
2932
2
}
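
The function above first counts the loop's real (non-negative) tensor inputs and then fills a tight, stack-allocated array of exactly that size before building the noop. A minimal sketch of that compaction pattern follows, with plain ints standing in for ccv_nnc_tensor_symbol_t.

#include <stdio.h>

int main(void)
{
	const int raw_inputs[4] = { -1, 7, -2, 3 }; /* Negative entries stand for special (non-tensor) inputs. */
	int i, input_size = 0;
	for (i = 0; i < 4; i++)
		if (raw_inputs[i] >= 0)
			++input_size;
	if (!input_size) /* Only special inputs: nothing to do, mirroring the early return above. */
		return 0;
	int inputs[input_size]; /* Tight variable-length array, as the real code does with tensor symbols. */
	input_size = 0;
	for (i = 0; i < 4; i++)
		if (raw_inputs[i] >= 0)
			inputs[input_size++] = raw_inputs[i];
	for (i = 0; i < input_size; i++)
		printf("%d ", inputs[i]); /* Prints "7 3 ". */
	printf("\n");
	return 0;
}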
2933
2934
// Plan out how we allocate tensors (should I do optimizations on the graph here or not at all?).
2935
static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
2936
6.09k
{
2937
6.09k
  assert(source_size > 0);
2938
6.09k
  assert(destination_size > 0);
2939
6.09k
  // First, fill all the "auto" holes.
2940
6.09k
  // This is the symbol table with the "auto" info filled in.
2941
6.09k
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
2942
6.09k
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
2943
6.09k
  ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
2944
12.1k
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new6.09k
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
2945
12.1k
  ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
2946
12.1k
  int i, j, k, p, q;
2947
12.1k
  const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info = p_exec_symbol_info ? 
p_exec_symbol_info + (symbolic_graph->exec_idx - 1)49
:
06.04k
;
2948
12.1k
  ccv_sparse_matrix_t* exec_dep;
2949
12.1k
  ccv_nnc_tensor_block_t* tensor_blocks;
2950
12.1k
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
2951
12.1k
  int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2952
12.1k
  // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
2953
12.1k
  // are automatically filled in, and all the sub-graphs are processed.
2954
12.1k
  // There is a last step though, for a while loop, it is parameterized:
2955
12.1k
  // while (x > 5) {
2956
12.1k
  //     y = x + 1;
2957
12.1k
  // } (y => x) // This means after this loop is done, y's value will be copied over to x.
2958
12.1k
  // we will do our best to avoid to do the actual data copy, what we do here is to check whether y can be x's alias.
2959
12.1k
  // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
2960
12.1k
  // it is an inplace operation.
2961
12.1k
  // But if y cannot be x's alias, for example, this while loop looks like this:
2962
12.1k
  // while (x > 5) {
2963
12.1k
  //     y = x + a
2964
12.1k
  //     b = x + y
2965
12.1k
  // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
2966
12.1k
  // For this example, y cannot be x's alias because x is used later to compute b (and that computation
2967
12.1k
  // has dependency on y as well).
2968
12.1k
  // For this case, we need to modify the computation graph. Previously, the graph looks like this:
2969
12.1k
  // y = x + a -> b = x + y
2970
12.1k
  // This graph will be extended to look like this:
2971
12.1k
  // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
2972
12.1k
  // while (x0 > 5) {
2973
12.1k
  //     y0 = x0 + a0
2974
12.1k
  //     b0 = x0 + y0
2975
12.1k
  //     if (y0 > 5) break
2976
12.1k
  //     y1 = y0 + b0
2977
12.1k
  //     b1 = y0 + y1
2978
12.1k
  // } (y1 => x0, b1 => a0)
2979
12.1k
  // After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere
2980
12.1k
  // with each other now).
2981
12.1k
  // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
2982
12.1k
  // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0)).
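
As a toy illustration of the pointer-switch idea the comment above describes (the real mechanism is the tensor_multiview_t construct; this standalone snippet only mimics the effect of (y => x) without a data copy, and the loop condition is inverted so the toy terminates):

#include <stdio.h>

int main(void)
{
	float buf0 = 1, buf1 = 0;
	float* x = &buf0; /* Plays the role of x0. */
	float* y = &buf1; /* Plays the role of y0. */
	while (*x <= 5)
	{
		*y = *x + 1;        /* y = x + 1 */
		float* const t = x; /* (y => x): switch pointers instead of copying data back. */
		x = y;
		y = t;
	}
	printf("x = %g\n", *x); /* Prints "x = 6". */
	return 0;
}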
2983
12.1k
  ccv_nnc_symbolic_graph_t* dup_graph = 0;
2984
12.1k
  int* dup_exec_ref = 0;
2985
12.1k
  int* dup_tensor_block_ref = 0;
2986
12.1k
  int unroll_count = 0;
2987
12.1k
  // In true recursive fashion, I need to call all the sub-graphs and do the pre-compilation for them one by one.
2988
12.1k
  ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)
ccmalloc6.09k
(sizeof(ccv_nnc_symbolic_graph_prep_t));
2989
12.1k
  prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
2990
12.1k
  prep->flags = 0;
2991
12.1k
  // Cannot handle duplicating a node that is itself a graph.
2992
12.1k
  if (
p_exec_symbol_info6.09k
)
2993
49
  {
2994
49
    prep->flags = p_node_info->flags;
2995
49
    if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
2996
21
    {
2997
21
      _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
2998
21
      _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
2999
28
    } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3000
28
      // TODO: We want to try our best to fit as many of its corresponding inputs / outputs as possible into the companion_ref group.
3001
28
    }
3002
49
  }
3003
12.1k
  ccv_nnc_symbolic_graph_prep_t** sub_preps = 
symbolic_graph->sub_graphs6.09k
&&
symbolic_graph->sub_graphs->rnum29
?
(ccv_nnc_symbolic_graph_prep_t**)29
cccalloc29
(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) :
06.06k
;
3004
12.1k
  ccv_array_t* anonymous_block_free_list = 0;
3005
12.1k
  const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3006
12.1k
  // Record whether this tensor is folded in this round.
3007
12.1k
  uint32_t* const tensor_fold = (uint32_t*)
ccmalloc6.09k
(sizeof(uint32_t) * tensor_fold_size);
3008
31.7k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3009
31.8k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3010
49
    {
3011
49
      assert(symbolic_graph->sub_graphs);
3012
49
      ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3013
49
      ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3014
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3015
49
      sub_prep->dup_breakpoints = dup_breakpoints;
3016
49
      sub_prep->p = prep;
3017
49
      sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3018
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3019
49
      const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3020
296
      for (i = 0; i < s_alloc_prep->block_size; 
i++247
)
3021
247
      {
3022
247
        const int block_ref = s_alloc_prep->blocks[i].block_ref;
3023
247
        const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3024
247
        if (block_ref < sub_prep->tensor_symbol_info_size)
3025
192
        {
3026
192
          // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3027
192
          // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3028
192
          if (s_tensor_blocks[block_ref].bypass_ref)
3029
1
          {
3030
1
            int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3031
1
            while (s_tensor_blocks[bypass_ref].ref)
3032
0
              bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3033
1
            if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3034
1
              
s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1]0
)
3035
1
              continue;
3036
191
          }
3037
191
          if (s_tensor_blocks[block_ref].p_refs[0])
3038
91
          {
3039
91
            /* If it is already properly assigned, next. */
3040
91
            if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3041
91
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3042
91
            {
3043
91
              if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3044
90
                s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3045
1
              else {
3046
1
                assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3047
1
                s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3048
1
              }
3049
91
            }
3050
91
            /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3051
91
            if (s_tensor_blocks[block_ref].p_refs[1] &&
3052
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1]3
&&
3053
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1]3
)
3054
3
            {
3055
3
              assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3056
3
              assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3057
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3058
3
            }
3059
91
          }
3060
191
        } else 
if (55
s_tensor_blocks[block_ref].dup_p_refs55
) {
3061
15
          /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of an anonymous block,
3062
15
           * which by default only has a life-cycle shared with this sub-graph node. The reason to extend it is that
3063
15
           * anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3064
15
           * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3065
15
           * its life-time to the end of the output tensor. */
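
A hypothetical sketch of what "extending the life-time" means operationally: the anonymous block's tail set gains extra exec indices, so the allocator keeps the region alive until those nodes. A plain growable int array stands in here for ccv_array_t and _ccv_nnc_tensor_block_add_exec.

#include <stdlib.h>

typedef struct {
	int* tail;      /* Exec indices at which the block's life-time ends. */
	int tail_count;
	int tail_cap;
} toy_life_t;

/* Append an exec index to the tail set if it is not already there, extending the life-time. */
static void toy_extend_tail(toy_life_t* const block, const int exec)
{
	int i;
	for (i = 0; i < block->tail_count; i++)
		if (block->tail[i] == exec)
			return;
	if (block->tail_count == block->tail_cap)
	{
		block->tail_cap = block->tail_cap ? block->tail_cap * 2 : 4;
		block->tail = (int*)realloc(block->tail, sizeof(int) * block->tail_cap);
	}
	block->tail[block->tail_count++] = exec;
}

int main(void)
{
	toy_life_t block = { 0, 0, 0 };
	toy_extend_tail(&block, 12); /* Life-time now reaches exec 12 (e.g. the parent output's last use). */
	toy_extend_tail(&block, 12); /* Duplicate is ignored. */
	free(block.tail);
	return block.tail_count == 1 ? 0 : 1;
}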
3066
15
          if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3067
13
            s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3068
33
          for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; 
j++18
)
3069
18
            ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3070
15
        }
3071
247
      }
3072
49
    }
3073
31.7k
    const int init_tensor_block_size = tensor_block_size;
3074
31.7k
    int rw_anonymous_buffer_size_cap = 0;
3075
31.7k
    int ro_anonymous_buffer_size_cap = 0;
3076
31.7k
    if (anonymous_block_free_list)
3077
17
      ccv_array_clear(anonymous_block_free_list);
3078
31.7k
    memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3079
31.8k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3080
49
    {
3081
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3082
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3083
49
      int rw_anonymous_buffer_size = 0;
3084
49
      int ro_anonymous_buffer_size = 0;
3085
229
      for (i = 0; i < s_alloc_prep->buffer_size; 
i++180
)
3086
180
        if (s_alloc_prep->buffers[i].p_refs[0])
3087
90
        {
3088
90
          /* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */
3089
90
          int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3090
90
          /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3091
90
          int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3092
90
          assert(p_ref_0_is_in_or_out != 0);
3093
90
          int unref_p_ref_0 = p_ref_0;
3094
92
          while (tensor_blocks[unref_p_ref_0].ref)
3095
2
            unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3096
90
          /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3097
90
          assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3098
90
          if (s_alloc_prep->buffers[i].p_refs[1])
3099
4
          {
3100
4
            int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3101
4
            const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3102
4
            assert(p_ref_1_is_in_or_out != 0);
3103
4
            int unref_p_ref_1 = p_ref_1;
3104
4
            while (tensor_blocks[unref_p_ref_1].ref)
3105
0
              unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3106
4
            /* See above comment for the similar p_ref_0 check. */
3107
4
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3108
4
            assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3109
4
            int p_ref_t;
3110
4
            if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3111
3
            {
3112
3
              CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3113
3
              CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3114
3
            }
3115
4
            p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3116
4
            /* If the dimension matches, can fold. */
3117
4
            if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3118
4
            {
3119
4
              const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3120
4
              if (folded)
3121
1
              {
3122
1
                p_ref_0 = p_ref_1;
3123
1
                unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3124
1
                tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3125
1
                for (j = 0; j < unroll_count; 
j++0
) /* Fold its duplicates as well. */
3126
0
                {
3127
0
                  const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3128
0
                  assert(folded && "the subsequent duplicates can be folded too.");
3129
0
                }
3130
1
              }
3131
4
            }
3132
4
          }
3133
90
          /* Only proceed if it is folded here (thus, the input / output tensor can be connected and reuse is not a problem).
3134
90
           * Or if the p_ref_0 is the output, it is first started from this node (thus, I have full control over
3135
90
           * its life-cycle). Or if the p_ref_0 is the input, it ends in this node (thus, I can take over its
3136
90
           * life-cycle freely within this sub-graph (otherwise, if it is used anywhere else, I cannot change the content
3137
90
           * within its memory region)). Or, if this buffer is used as read-only and we don't have any output
3138
90
           * associated with it, then we are good. */
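
Condensing the conditions in the comment above into a predicate (a hedged restatement for illustration only; the flags are stand-ins for the folded-bit test, the head/tail checks, and the read-only test performed by the code that follows):

#include <stdio.h>

/* The buffer may take over the parent ref when the blocks were folded, or the output starts at
 * this node, or the input ends at this node, or the buffer is read-only. */
static int toy_can_take_over(const int folded, const int is_output, const int starts_here, const int ends_here, const int read_only)
{
	return folded || (is_output && starts_here) || (!is_output && ends_here) || read_only;
}

int main(void)
{
	printf("%d\n", toy_can_take_over(0, 1, 1, 0, 0)); /* Output first produced at this node: 1 */
	printf("%d\n", toy_can_take_over(0, 0, 0, 0, 0)); /* Input still alive elsewhere: 0 */
	return 0;
}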
3139
90
          if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3140
90
            
(89
p_ref_0_is_in_or_out == 189
&&
_ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)50
) ||
3141
90
            
(39
p_ref_0_is_in_or_out == -139
&&
_ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)39
) ||
3142
90
            
TENSOR_READ_WRITE8
(s_alloc_prep->buffers[i]) == READ_ONLY8
)
3143
86
          {
3144
86
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3145
27
              { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3146
86
            /* p_ref_0 is either the only one or the output tensor; we always prefer the output tensor (there
3147
86
             * is a long argument for why that is the case; the digest is that it is much easier to control your output
3148
86
             * than your input). */
3149
86
            s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3150
86
            s_alloc_prep->buffers[i].p_refs[1] = 0;
3151
86
            /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3152
86
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3153
86
            tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3154
95
            for (j = 0; j < unroll_count; 
j++9
) /* Change the size of its duplicates as well. */
3155
9
              tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3156
9
                tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3157
9
                  tensor_blocks[unref_p_ref_0].size;
3158
86
          } else {
3159
4
            s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3160
4
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3161
0
              ++ro_anonymous_buffer_size;
3162
4
            else
3163
4
              rw_anonymous_buffer_size += unroll_count + 1;
3164
4
          }
3165
90
        } else {
3166
90
          if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3167
63
            ++ro_anonymous_buffer_size;
3168
27
          else
3169
27
            rw_anonymous_buffer_size += unroll_count + 1;
3170
90
        }
3171
49
      if (ro_anonymous_buffer_size || 
rw_anonymous_buffer_size24
)
3172
28
      {
3173
28
        const int anonymous_block_free_list_cap = anonymous_block_free_list ? 
anonymous_block_free_list->rnum6
:
022
;
3174
28
        // All read-write buffers can (potentially) be reused between case..of branches.
3175
28
        rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3176
28
        // A read-only buffer cannot be reused between case..of branches.
3177
28
        ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3178
28
        /* Anonymous block, allocate additional tensor blocks for this. */
3179
28
        /* This is either because this is an internal tensor (don't have p_ref) */
3180
28
        /* or it is an anonymous block itself within the sub graphs of this while graph. */
3181
28
        tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3182
28
        memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3183
28
        if (dup_tensor_block_ref)
3184
3
          dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3185
174
        for (i = 0; i < s_alloc_prep->buffer_size; 
i++146
)
3186
146
          if (!s_alloc_prep->buffers[i].p_refs[0])
3187
94
          {
3188
94
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3189
63
            {
3190
63
              assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3191
63
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3192
63
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3193
63
              tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3194
63
              tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3195
63
              tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3196
63
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3197
63
              tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3198
63
              ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3199
63
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3200
63
              if (dup_p_refs && 
dup_p_refs->rnum > 00
)
3201
0
              {
3202
0
                for (j = 0; j < dup_p_refs->rnum; j++)
3203
0
                {
3204
0
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3205
0
                  assert(dup_p_ref >= 0);
3206
0
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3207
0
                  assert(tensor_blocks[dup_p_ref].tail);
3208
0
                  // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3209
0
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3210
0
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3211
0
                  {
3212
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3213
0
                    assert(p_node_info);
3214
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3215
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3216
0
                    {
3217
0
                      if (!tensor_blocks[tensor_block_size].dup_p_refs)
3218
0
                        tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3219
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3220
0
                    }
3221
0
                  }
3222
0
                  if (!tensor_blocks[tensor_block_size].tail)
3223
0
                    tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3224
0
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3225
0
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3226
0
                }
3227
63
              } else {
3228
63
                tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3229
63
                ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3230
63
              }
3231
132
              
for (j = 0; 63
j < source_size;
j++69
)
3232
69
                _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3233
63
              /* If this is read-only (based on SSA, i.e. first encountered as a read), and this is a
3234
63
               * sub-graph, mark it to the end of the graph. */
3235
63
              if (p_exec_symbol_info)
3236
12
                
for (j = 0; 6
j < destination_size;
j++6
)
3237
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3238
63
              /* If it is read-only, it is self-reflecting. */
3239
69
              for (k = 0; k < unroll_count; 
k++6
)
3240
6
              {
3241
12
                for (j = 0; j < destination_size; 
j++6
)
3242
6
                  if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3243
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3244
6
                /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3245
6
                assert(symbolic_graph->p);
3246
6
                dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3247
6
              }
3248
63
              ++tensor_block_size;
3249
63
            } else {
3250
31
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3251
31
              const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3252
31
              const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3253
31
              // Find suitable tensor block from the free list.
3254
31
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3255
31
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3256
31
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3257
31
              if (new_anonymous_tensor_block)
3258
28
              {
3259
28
                tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3260
28
                tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3261
28
                tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3262
28
                tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3263
28
                ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3264
28
              } else {
3265
3
                tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3266
3
                tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3267
3
              }
3268
31
              if (dup_p_refs && 
dup_p_refs->rnum > 04
)
3269
4
              {
3270
8
                for (j = 0; j < dup_p_refs->rnum; 
j++4
)
3271
4
                {
3272
4
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3273
4
                  assert(dup_p_ref >= 0);
3274
4
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3275
4
                  // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3276
4
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3277
4
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3278
0
                  {
3279
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3280
0
                    assert(p_node_info);
3281
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3282
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3283
0
                    {
3284
0
                      if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3285
0
                        tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3286
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3287
0
                    }
3288
0
                  }
3289
4
                  assert(tensor_blocks[dup_p_ref].tail);
3290
4
                  if (!tensor_blocks[tensor_block_idx].tail)
3291
4
                    tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3292
8
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; 
k++4
)
3293
4
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3294
4
                  // We have to add it to the wrap-around companion_ref as well.
3295
4
                  // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3296
4
                  // be occupied and unlikely to be reused), we cannot really do much about it because the companion_ref's
3297
4
                  // definition is too free-form, and if we enforce a stronger guarantee on this (such as that it must wrap around), that
3298
4
                  // guarantee may break down the line.
3299
4
                  if (tensor_blocks[dup_p_ref].companion_ref)
3300
0
                  {
3301
0
                    const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3302
0
                    for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3303
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3304
0
                    for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3305
0
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3306
0
                  }
3307
4
                }
3308
27
              } else if (new_anonymous_tensor_block) {
3309
24
                tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3310
24
                ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3311
24
              }
3312
31
              const int prev_tensor_block_idx = tensor_block_idx;
3313
31
              if (new_anonymous_tensor_block)
3314
28
              {
3315
28
                if (!anonymous_block_free_list)
3316
16
                  anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3317
28
                ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3318
28
                ++tensor_block_size;
3319
28
              }
3320
32
              for (k = 0; k < unroll_count; 
k++1
)
3321
1
              {
3322
1
                const int tensor_block_idx = new_anonymous_tensor_block ?
3323
1
                  (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3324
1
                  
dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k]0
;
3325
1
                TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3326
1
                TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3327
1
                if (new_anonymous_tensor_block)
3328
1
                {
3329
1
                  tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3330
1
                  tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3331
1
                  tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3332
1
                  tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3333
1
                  /* Attach to duplicated exec for this tensor block. */
3334
1
                  ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3335
1
                } else {
3336
0
                  tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3337
0
                  tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3338
0
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3339
0
3340
0
                }
3341
1
                if (dup_p_refs && dup_p_refs->rnum > 0)
3342
1
                {
3343
1
                  /* Not nil, not self-reflecting. */
3344
2
                  for (j = 0; j < dup_p_refs->rnum; 
j++1
)
3345
1
                  {
3346
1
                    const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3347
1
                    assert(dup_p_ref >= 0);
3348
1
                    assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3349
1
                    // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3350
1
                    // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3351
1
                    if (tensor_symbol_info[dup_p_ref].p_ref)
3352
0
                    {
3353
0
                      const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3354
0
                      assert(p_node_info);
3355
0
                      const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3356
0
                      if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3357
0
                      {
3358
0
                        if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3359
0
                          tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3360
0
                        ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3361
0
                      }
3362
0
                    }
3363
1
                    assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3364
1
                    const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3365
1
                    assert(tensor_blocks[dup_dup_p_ref].tail);
3366
1
                    if (!tensor_blocks[tensor_block_idx].tail)
3367
1
                      tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3368
2
                    for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; 
q++1
)
3369
1
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3370
1
                    // We have to add it to the wrap-around companion_ref as well.
3371
1
                    if (tensor_blocks[dup_dup_p_ref].companion_ref)
3372
0
                    {
3373
0
                      const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3374
0
                      for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3375
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3376
0
                      for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3377
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3378
0
                    }
3379
1
                  }
3380
1
                } else 
if (0
new_anonymous_tensor_block0
) {
3381
0
                  tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3382
0
                  ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3383
0
                }
3384
1
                if (new_anonymous_tensor_block)
3385
1
                  ++tensor_block_size;
3386
1
              }
3387
31
            }
3388
94
          }
3389
28
      }
3390
49
    }
3391
31.7k
  } ccv_nnc_graph_visit_endfor
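
The tensor_fold bitmask consulted inside the loop above packs one bit per tensor block, 32 to a word; below is a small self-contained sketch of the same indexing, where (idx >> 5) selects the word and (idx & 0x1f) the bit, and FOLD_WORDS mirrors the (n + 31) >> 5 sizing used for tensor_fold_size.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FOLD_WORDS(n) (((n) + 31) >> 5)

static void fold_set(uint32_t* const fold, const int idx)
{
	fold[idx >> 5] |= (1u << (idx & 0x1f));
}

static int fold_test(const uint32_t* const fold, const int idx)
{
	return !!(fold[idx >> 5] & (1u << (idx & 0x1f)));
}

int main(void)
{
	uint32_t fold[FOLD_WORDS(100)];
	memset(fold, 0, sizeof(fold));
	fold_set(fold, 37);
	printf("%d %d\n", fold_test(fold, 37), fold_test(fold, 38)); /* Prints "1 0". */
	return 0;
}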
3392
12.1k
  
if (6.09k
anonymous_block_free_list6.09k
)
3393
16
    ccv_array_free(anonymous_block_free_list);
3394
6.09k
  ccfree(tensor_fold);
3395
6.09k
  // It is time to guess what the best tensor placement is and create the opaque tensor arena. The alloc_dep will return
3396
6.09k
  // the allocation dependencies, i.e., which tensors reuse existing tensors.
3397
6.09k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new(exec_dep, tensor_blocks, tensor_block_size);
3398
6.09k
  ccv_matrix_free(exec_dep);
3399
6.09k
  prep->while_count_tensor = 0;
3400
6.09k
  prep->dup_breakpoints = 0;
3401
6.09k
  prep->p = 0;
3402
6.09k
  prep->symbolic_graph = symbolic_graph;
3403
6.09k
  prep->p_idx = symbolic_graph->p_idx;
3404
6.09k
  prep->exec_idx = symbolic_graph->exec_idx;
3405
6.09k
  prep->sub_prep_size = symbolic_graph->sub_graphs ? 
symbolic_graph->sub_graphs->rnum29
:
06.06k
;
3406
6.09k
  prep->sub_preps = sub_preps;
3407
6.09k
  prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3408
6.09k
  prep->exec_symbol_info = exec_symbol_info;
3409
6.09k
  prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3410
6.09k
  prep->tensor_symbol_info = tensor_symbol_info;
3411
6.09k
  prep->unroll_count = unroll_count;
3412
6.09k
  prep->dup_tensor_block_ref = dup_tensor_block_ref;
3413
6.09k
  prep->tensor_block_size = tensor_block_size;
3414
6.09k
  prep->tensor_blocks = tensor_blocks;
3415
6.09k
  prep->exec_flags = exec_flags;
3416
6.09k
  prep->visit = visit;
3417
6.09k
  prep->alloc_prep = alloc_prep;
3418
6.09k
  if (dup_graph)
3419
13
    ccv_nnc_symbolic_graph_free(dup_graph);
3420
6.09k
  if (dup_exec_ref)
3421
6.09k
    
ccfree13
(dup_exec_ref)13
;
3422
6.09k
  return prep;
3423
12.1k
}
3424
3425
static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3426
6.09k
{
3427
6.09k
  int i;
3428
6.09k
  _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3429
6.09k
  ccfree(prep->exec_flags);
3430
6.14k
  for (i = 0; i < prep->sub_prep_size; 
i++50
)
3431
50
    if (prep->sub_preps[i])
3432
49
      _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3433
6.09k
  if (prep->sub_preps)
3434
6.09k
    
ccfree29
(prep->sub_preps)29
;
3435
6.09k
  ccfree(prep->tensor_symbol_info);
3436
6.09k
  ccfree(prep->exec_symbol_info);
3437
6.09k
  if (prep->dup_tensor_block_ref)
3438
6.09k
    
ccfree13
(prep->dup_tensor_block_ref)13
;
3439
6.09k
  _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3440
6.09k
  ccv_nnc_graph_visit_free(prep->visit);
3441
6.09k
  ccfree(prep);
3442
6.09k
}
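
The tear-down above is a straightforward post-order walk over the prep tree. A toy sketch of the same ownership pattern with stand-in types (each node owns an optional array of child nodes, freed children-first):

#include <stdlib.h>

typedef struct toy_prep_s {
	int sub_prep_size;
	struct toy_prep_s** sub_preps; /* May be NULL when there are no sub-graphs. */
} toy_prep_t;

static void toy_prep_free(toy_prep_t* const prep)
{
	int i;
	for (i = 0; i < prep->sub_prep_size; i++)
		if (prep->sub_preps[i])
			toy_prep_free(prep->sub_preps[i]); /* Children first. */
	if (prep->sub_preps)
		free(prep->sub_preps);
	free(prep);
}

int main(void)
{
	toy_prep_t* const child = (toy_prep_t*)calloc(1, sizeof(toy_prep_t));
	toy_prep_t* const root = (toy_prep_t*)calloc(1, sizeof(toy_prep_t));
	root->sub_prep_size = 1;
	root->sub_preps = (toy_prep_t**)calloc(1, sizeof(toy_prep_t*));
	root->sub_preps[0] = child;
	toy_prep_free(root);
	return 0;
}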
3443
3444
static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3445
6.09k
{
3446
6.09k
  int i, j;
3447
31.7k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3448
31.7k
    if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3449
21
    {
3450
21
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3451
21
      assert(graph_ref >= 0);
3452
21
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3453
43
      for (i = 0; i < node->p_while.input_size; 
i++22
)
3454
22
        if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i]))
3455
22
        {
3456
20
          ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3457
20
          const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i]);
3458
21
          for (j = 0; j < d; 
j++1
)
3459
1
            prep = prep->p;
3460
20
          prep->while_count_tensor = 1;
3461
20
        }
3462
21
    }
3463
31.8k
    
for (i = 0; 31.7k
i < node->graph_ref_size;
i++49
)
3464
49
    {
3465
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3466
49
      if (graph_ref >= 0)
3467
49
        _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3468
49
    }
3469
31.7k
  } ccv_nnc_graph_visit_endfor
3470
6.09k
}
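
The while-count lookup above decodes how many enclosing graphs to climb and then follows the p pointers that many levels before marking (or reading) the loop counter. A stand-in sketch of that climb; the encode/decode macros themselves are not reproduced here, only the parent walk.

#include <assert.h>
#include <stdio.h>

typedef struct toy_level_s {
	struct toy_level_s* p; /* Parent prep, 0 at the root. */
	int while_count_tensor;
} toy_level_t;

static toy_level_t* toy_climb(toy_level_t* prep, const int d)
{
	int j;
	for (j = 0; j < d; j++)
	{
		assert(prep->p); /* The decoded depth must not exceed the nesting depth. */
		prep = prep->p;
	}
	return prep;
}

int main(void)
{
	toy_level_t root = { 0, 0 }, child = { &root, 0 };
	toy_climb(&child, 1)->while_count_tensor = 1; /* Mark the enclosing graph, one level up. */
	printf("%d\n", root.while_count_tensor); /* Prints "1". */
	return 0;
}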
3471
3472
static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3473
89.7k
{
3474
89.7k
  if (symbol >= 0)
3475
65.5k
    return graph_prep->tensor_arena->vt_tensors[symbol];
3476
24.1k
  if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3477
24.1k
    return 0;
3478
20
  assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol));
3479
20
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3480
20
  int i;
3481
20
  const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol);
3482
21
  for (i = 0; i < d; 
i++1
)
3483
1
    prep = prep->p;
3484
20
  assert(prep->while_count_tensor);
3485
20
  return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3486
20
}
3487
3488
static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3489
6.09k
{
3490
6.09k
  int i;
3491
6.09k
  int* const exec_cvt = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
3492
6.09k
  ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3493
6.09k
  graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3494
6.09k
  graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3495
6.09k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3496
58.0k
  for (i = 0; i < graph_exec_arena->graph_exec_size; 
i++52.0k
)
3497
52.0k
    if (graph_execs[i].graph == graph)
3498
31.7k
      graph_execs[i].d = exec_cvt[graph_execs[i].d];
3499
6.09k
  ccfree(exec_cvt);
3500
6.09k
}
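
A minimal sketch of the remap step above: the topsort produces a conversion table from old node indices to their new positions, and every stored exec handle into this graph is rewritten through that table (ccv_nnc_graph_topsort itself is not called here; the table below is hand-written for illustration).

#include <stdio.h>

int main(void)
{
	const int exec_cvt[4] = { 2, 0, 3, 1 }; /* Old index -> new position after sorting. */
	int handles[3] = { 0, 3, 1 };           /* Stored node indices that must be fixed up. */
	int i;
	for (i = 0; i < 3; i++)
		handles[i] = exec_cvt[handles[i]];
	for (i = 0; i < 3; i++)
		printf("%d ", handles[i]); /* Prints "2 1 0". */
	printf("\n");
	return 0;
}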
3501
3502
static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3503
6.09k
{
3504
6.09k
  int i, j, k;
3505
6.09k
  ccv_nnc_graph_t* const graph = graph_prep->graph;
3506
6.09k
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3507
6.09k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3508
6.09k
  graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3509
6.09k
  graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3510
6.09k
  graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3511
6.09k
  graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3512
6.09k
  memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3513
6.09k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3514
6.09k
  int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3515
58.0k
  for (i = 0; i < exec_symbol_info_size; 
i++52.0k
)
3516
52.0k
  {
3517
52.0k
    max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size);
3518
52.0k
    max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size);
3519
52.0k
    if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3520
22
      max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size);
3521
52.0k
    graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3522
52.0k
    graph_execs[i].graph = 0;
3523
52.0k
  }
3524
6.14k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
3525
50
    max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size);
3526
6.09k
  ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)];
3527
6.09k
  ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)];
3528
6.09k
  ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)];
3529
6.09k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3530
6.09k
  const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3531
6.09k
  // Create nodes; this is in topological order.
3532
31.7k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3533
31.7k
    if (CCV_NO_GRAPH_EXEC(graph_execs[idx]))
3534
31.7k
    {
3535
121k
      for (i = 0; i < node->input_size; 
i++89.7k
)
3536
89.7k
        max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3537
81.5k
      for (i = 0; i < node->output_size; 
i++49.7k
)
3538
49.7k
        max_outputs[i] = node->outputs[i] >= 0 ? 
tensor_arena->vt_tensors[node->outputs[i]]42.9k
:
06.85k
;
3539
31.7k
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3540
21
      {
3541
21
        const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3542
21
        assert(graph_ref >= 0);
3543
21
        ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3544
21
        ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3545
21
        graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3546
21
        const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3547
21
        ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3548
21
        ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3549
43
        for (i = 0; i < node->p_while.input_size; 
i++22
)
3550
22
          max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3551
42
        for (i = 0; i < sub_symbolic_graph->breakpoint_size; 
i++21
)
3552
21
          max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3553
21
        ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3554
21
        _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3555
31.7k
      } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3556
24
        for (i = 0; i < node->output_size; 
i++13
)
3557
13
          if (max_outputs[i] && max_outputs[i]->alias_ref)
3558
10
            max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3559
11
        graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3560
11
        // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3561
22
        for (i = 0; i < node->case_of.argument.offset; 
i++11
)
3562
11
        {
3563
11
          ccv_nnc_tensor_t* const update = max_inputs[i];
3564
11
          if (!CCV_IS_TENSOR_MULTIVIEW(update)) // No need if it is a naked tensor.
3565
11
            
continue9
;
3566
2
          int flag = 0;
3567
2
          for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; 
j++0
)
3568
0
            flag = (update == max_inputs[j]);
3569
2
          if (!flag)
3570
2
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3571
2
        }
3572
11
        const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 
11
:
010
;
3573
11
        ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3574
11
        if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3575
1
        {
3576
1
          // Add another graph for data transfer.
3577
1
          ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3578
2
          for (i = 0; i < node->output_size; 
i++1
)
3579
1
            max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 
00
;
3580
1
          ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size), max_outputs, ccv_min(node->input_size, node->output_size));
3581
1
          ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3582
1
          ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3583
1
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3584
1
          int exec_cvt;
3585
1
          ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3586
1
        }
3587
39
        for (i = 0; i < node->graph_ref_size; 
i++28
)
3588
28
        {
3589
28
          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3590
28
          if (graph_ref < 0)
3591
0
            continue;
3592
28
          ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3593
28
          const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3594
28
          ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3595
28
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3596
28
          _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3597
28
        }
3598
31.7k
      } else {
3599
31.7k
        graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3600
31.7k
      }
3601
31.7k
      ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3602
31.7k
    }
3603
31.7k
  } ccv_nnc_graph_visit_endfor
3604
6.09k
  // Then connect them.
3605
31.7k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3606
31.7k
    if (node->outgoings)
3607
52.9k
      
for (i = 0; 25.0k
i < node->outgoings->rnum;
i++27.9k
)
3608
27.9k
      {
3609
27.9k
        const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
3610
27.9k
        if (graph_execs[outgoing].graph)
3611
27.3k
          ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3612
27.9k
      }
3613
31.7k
  } ccv_nnc_graph_visit_endfor
3614
6.09k
  int source_exec_created = 0;
3615
6.09k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3616
6.09k
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3617
6.09k
  ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3618
6.09k
  // After the graph is materialized, we need to handle the case that some of these tensors need to be initialized to zero before use.
3619
99.8k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++93.7k
)
3620
93.7k
  {
3621
93.7k
    if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags))
3622
93.7k
    {
3623
121
      int ref = i;
3624
121
      while (tensor_symbol_info[ref].alias_ref)
3625
0
        ref = tensor_symbol_info[ref].alias_ref - 1;
3626
121
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref41
)
3627
0
        ref = tensor_blocks[ref].ref - 1;
3628
121
      // This is not computable. It could be that we marked a const tensor as init zero.
3629
121
      if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]))
3630
121
        
continue41
;
3631
80
      // If this tensor is not used by any exec, we don't need to init at all. Skip.
3632
80
      if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3633
0
        continue;
3634
80
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3635
80
      // Now that we have the original tensor, we can get the actual tensor and construct the set command.
3636
80
      ccv_nnc_graph_exec_t set_exec;
3637
80
      if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3638
27
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3639
53
      else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3640
53
        set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3641
160
      for (j = 0; j < tensor_blocks[ref].head->rnum; 
j++80
)
3642
80
      {
3643
80
        const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j);
3644
80
        if (outgoing >= exec_symbol_info_size)
3645
0
          continue;
3646
80
        assert(outgoing >= 0);
3647
80
        assert(graph_execs[outgoing].graph);
3648
80
        ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3649
80
      }
3650
80
      int flags = 0;
3651
80
      if (alloc_dep[ref])
3652
24
        
for (j = 0; 12
j < alloc_dep[ref]->rnum;
j++12
)
3653
12
        {
3654
12
          const int d = *(int*)ccv_array_get(alloc_dep[ref], j);
3655
12
          // This is from alloc_dep, it should be computable.
3656
12
          assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
3657
12
          if (tensor_blocks[d].tail)
3658
24
            
for (k = 0; 12
k < tensor_blocks[d].tail->rnum;
k++12
)
3659
12
            {
3660
12
              const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
3661
12
              if (incoming >= exec_symbol_info_size)
3662
0
                continue;
3663
12
              assert(incoming >= 0);
3664
12
              assert(graph_execs[incoming].graph);
3665
12
              ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3666
12
              flags = 1;
3667
12
            }
3668
12
        }
3669
80
      // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3670
80
      if (!flags)
3671
68
      {
3672
68
        if (!source_exec_created)
3673
38
        {
3674
38
          graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3675
38
          source_exec_created = 1;
3676
38
        }
3677
68
        ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3678
68
      }
3679
80
    }
3680
93.7k
  }
3681
6.09k
  // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3682
6.09k
  // (we need that if it is not associated as an input / output of any exec, which is possible if all execs associate
3683
6.09k
  // with its alias).
3684
6.09k
  assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size);
3685
99.8k
  
for (i = 0; 6.09k
i < tensor_arena->vt_tensor_size;
i++93.7k
)
3686
93.7k
  {
3687
93.7k
    ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3688
93.7k
    // If it is a multiview tensor, inspect all its heads to see whether we have already associated it with the node.
3689
93.7k
    if (mv && 
CCV_IS_TENSOR_MULTIVIEW84.3k
(mv))
3690
93.7k
    {
3691
53
      const ccv_array_t* const head = tensor_blocks[i].head;
3692
53
      if (head && 
head->rnum > 047
)
3693
94
        
for (j = 0; 47
j < head->rnum;
j++47
)
3694
47
        {
3695
47
          const int idx = *(int*)ccv_array_get(head, j);
3696
47
          if (idx >= exec_symbol_info_size)
3697
1
            continue;
3698
46
          assert(idx >= 0);
3699
46
          const int d = graph_execs[idx].d;
3700
46
          ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d);
3701
46
          int flag = 0;
3702
46
          if (exec_info->tensor_wraps_ref)
3703
32
          {
3704
32
            ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1);
3705
113
            for (k = 0; k < tensor_wrap_array->size && 
!flag88
;
k++81
)
3706
81
              flag = (tensor_wrap_array->tensor_wraps[k] && 
tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv55
);
3707
32
          }
3708
46
          // If none set the flag, it needs to be included in the broadcast.
3709
46
          if (!flag)
3710
19
            ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3711
46
        }
3712
53
    }
3713
93.7k
  }
3714
6.09k
  // Create source / destination phony nodes. This is to facilitate use of the compiled graph.
3715
6.09k
  // Also, this is needed if you have init zero execs.
3716
6.09k
  if (source_exec_created || 
source_size > 16.05k
)
3717
108
  {
3718
108
    if (!source_exec_created)
3719
70
      graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3720
489
    for (i = 0; i < source_size; 
i++381
)
3721
381
      ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3722
5.98k
  } else {
3723
5.98k
    assert(!source_exec_created);
3724
5.98k
    assert(source_size == 1);
3725
5.98k
    graph_exec_arena->source = graph_execs[sources[0].d];
3726
5.98k
  }
3727
6.09k
  if (destination_size == 1)
3728
6.01k
    graph_exec_arena->destination = graph_execs[destinations[0].d];
3729
77
  else {
3730
77
    graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3731
1.05k
    for (i = 0; i < destination_size; 
i++973
)
3732
973
      ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3733
77
  }
3734
6.09k
  ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3735
6.09k
  ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3736
6.09k
  return graph_exec_arena;
3737
6.09k
}
3738
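The phony source / destination execs created at the end of the function above are what a caller retrieves to schedule the whole compiled graph. A minimal sketch, assuming the public ccv_nnc_graph_exec_source / ccv_nnc_graph_exec_destination accessors and the sources/destinations-before-tape argument ordering of ccv_nnc_graph_run from ccv_nnc.h:

  // Illustrative only: run everything between the phony source and destination nodes.
  const ccv_nnc_graph_exec_t source = ccv_nnc_graph_exec_source(graph_exec_arena);
  const ccv_nnc_graph_exec_t destination = ccv_nnc_graph_exec_destination(graph_exec_arena);
  ccv_nnc_graph_run(graph, 0, &source, 1, &destination, 1, 0, 0); // trailing zeros: no tensor tape, no stream context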
3739
static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3740
11
{
3741
11
  if (graph_prep->symbolic_graph == pair)
3742
4
    return graph_prep->graph;
3743
7
  int i;
3744
10
  for (i = 0; i < graph_prep->sub_prep_size; 
i++3
)
3745
7
    if (graph_prep->sub_preps[i])
3746
7
    {
3747
7
      ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3748
7
      if (graph)
3749
4
        return graph;
3750
7
    }
3751
7
  
return 03
;
3752
7
}
3753
3754
static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3755
6.04k
{
3756
6.04k
  int i;
3757
6.08k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++43
)
3758
43
    if (graph_prep->sub_preps[i])
3759
42
    {
3760
42
      if (graph_prep->sub_preps[i]->symbolic_graph->pair)
3761
4
        graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
3762
42
    }
3763
6.04k
}
3764
3765
static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3766
6.09k
{
3767
6.09k
  assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
3768
6.09k
  int i;
3769
58.0k
  for (i = 0; i < graph_prep->exec_symbol_info_size; 
i++52.0k
)
3770
52.0k
  {
3771
52.0k
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags))
3772
52.0k
      
continue9
;
3773
51.9k
    if (graph_exec_arena->graph_execs[i].graph && 
graph_prep->exec_symbol_info[i].pair_ref31.7k
)
3774
15.7k
    {
3775
15.7k
      ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
3776
15.7k
        .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
3777
15.7k
        .graph = graph_prep->symbolic_graph->pair ? 
graph_prep->symbolic_graph->pair4
:
graph_prep->symbolic_graph15.7k
,
3778
15.7k
      });
3779
15.7k
      if (pair_exec.d >= 0)
3780
443
        ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
3781
15.7k
    }
3782
51.9k
  }
3783
6.14k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
3784
50
    if (graph_prep->sub_preps[i])
3785
49
      _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
3786
6.09k
}
3787
3788
static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3789
6.09k
{
3790
6.09k
  int i;
3791
6.09k
  if (graph_prep->dup_breakpoints)
3792
2
  {
3793
2
    // Strip the const modifier only possible because it is a sub-graph.
3794
2
    ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
3795
4
    for (i = 0; i < graph_prep->dup_breakpoints->rnum; 
i++2
)
3796
2
      ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i));
3797
2
    ccv_array_free(graph_prep->dup_breakpoints);
3798
2
    graph_prep->dup_breakpoints = 0;
3799
2
    graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3800
2
    // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
3801
2
    memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
3802
2
    // Since exec_symbol_info changed, create a new visit object.
3803
2
    assert(symbolic_graph->sources);
3804
2
    assert(symbolic_graph->destinations);
3805
2
    ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0);
3806
2
    const int source_size = symbolic_graph->sources->rnum;
3807
2
    ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0);
3808
2
    const int destination_size = symbolic_graph->destinations->rnum;
3809
4
    ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new2
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3810
4
    ccv_nnc_graph_visit_free(graph_prep->visit);
3811
4
    graph_prep->visit = visit;
3812
4
    assert(graph_prep->p);
3813
4
    ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
3814
2
  }
3815
31.7k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3816
31.8k
    for (i = 0; i < node->graph_ref_size; 
i++49
)
3817
49
    {
3818
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3819
49
      if (graph_ref >= 0)
3820
49
        _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
3821
49
    }
3822
31.7k
  } ccv_nnc_graph_visit_endfor
3823
6.09k
}
3824
3825
const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
3826
3827
void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
3828
6.04k
{
3829
6.04k
  assert(graph_ref);
3830
6.04k
  assert(tensor_arena_ref);
3831
6.04k
  assert(graph_exec_arena_ref);
3832
6.04k
  int i;
3833
6.04k
  // Cannot bind the multi-view.
3834
53.5k
  for (i = 0; i < tensor_bind_size; 
i++47.4k
)
3835
47.4k
  {
3836
47.4k
    assert(tensor_binds[i].tensor);
3837
47.4k
    assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor));
3838
47.4k
  }
3839
6.04k
  ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
3840
6.04k
  _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
3841
6.04k
  ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
3842
6.04k
  _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
3843
6.04k
  *tensor_arena_ref = tensor_arena;
3844
6.04k
  // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
3845
6.04k
  _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
3846
6.04k
  // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
3847
6.04k
  _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
3848
6.04k
  *graph_ref = graph_prep->graph;
3849
6.04k
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
3850
6.04k
  _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
3851
6.04k
  _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
3852
6.04k
  *graph_exec_arena_ref = graph_exec_arena;
3853
6.04k
  _ccv_nnc_symbolic_graph_prep_free(graph_prep);
3854
6.04k
}
3855
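A minimal end-to-end sketch of the entry point above, assuming the convenience macros from ccv_nnc_easy.h (CPU_TENSOR_NHWC, TENSOR_SYMBOL_LIST, SYMBOLIC_GRAPH_SOURCES / SYMBOLIC_GRAPH_DESTINATIONS, TRAVERSE_FULL) and the element-wise sum command; the tensor shapes and symbol names are made up for illustration:

  // Illustrative only: build a tiny symbolic graph c = a + b, compile, run, and free.
  ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
  const ccv_nnc_tensor_symbol_t a = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 16), "a");
  const ccv_nnc_tensor_symbol_t b = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 16), "b");
  const ccv_nnc_tensor_symbol_t c = ccv_nnc_tensor_symbol_new(symbolic_graph, CPU_TENSOR_NHWC(32F, 16), "c");
  ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0), TENSOR_SYMBOL_LIST(a, b), TENSOR_SYMBOL_LIST(c), "sum");
  ccv_nnc_graph_exec_symbol_autogen(symbolic_graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
  ccv_nnc_graph_t* graph = 0;
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
  ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params, 0, 0, 0, 0, SYMBOLIC_GRAPH_SOURCES(symbolic_graph), SYMBOLIC_GRAPH_DESTINATIONS(symbolic_graph), &graph, &tensor_arena, &graph_exec_arena);
  ccv_nnc_tensor_t* const a_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, a);
  ccv_nnc_tensor_t* const b_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, b);
  int i;
  for (i = 0; i < 16; i++)
    a_tensor->data.f32[i] = 1, b_tensor->data.f32[i] = 2;
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_arena_free(tensor_arena);
  ccv_nnc_graph_exec_arena_free(graph_exec_arena);
  ccv_nnc_symbolic_graph_free(symbolic_graph);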
3856
static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
3857
6.09k
{
3858
6.09k
  // Buffers are inherited from above, no need to dealloc.
3859
6.09k
  int i;
3860
6.14k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
3861
50
    if (tensor_arena->sub_arenas[i])
3862
49
      _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
3863
6.15k
  for (i = 0; i < tensor_arena->m_tensor_idx->rnum; 
i++61
)
3864
61
  {
3865
61
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i));
3866
61
    assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv));
3867
61
    ccv_nnc_tensor_multiview_free(*mv);
3868
61
  }
3869
6.09k
  ccv_array_free(tensor_arena->tensor_metadata);
3870
6.09k
  ccv_array_free(tensor_arena->m_tensor_idx);
3871
6.09k
  if (tensor_arena->pb_vt_tensors)
3872
6.09k
    
ccfree64
(tensor_arena->pb_vt_tensors)64
;
3873
6.09k
  if (tensor_arena->vt_alias_r_refs_p)
3874
6.09k
    
ccfree64
(tensor_arena->vt_alias_r_refs_p)64
;
3875
6.09k
  if (tensor_arena->vt_sizes)
3876
6.09k
    
ccfree6
(tensor_arena->vt_sizes)6
;
3877
6.09k
  ccfree(tensor_arena);
3878
6.09k
}
3879
3880
void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
3881
83.0k
{
3882
83.0k
  assert(tensor_arena->graph_ref == (intptr_t)symbol.graph);
3883
83.0k
  assert(symbol.d < tensor_arena->vt_tensor_size);
3884
83.0k
  assert(symbol.d >= 0);
3885
83.0k
  // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
3886
83.0k
  int i;
3887
83.0k
  if (!tensor_arena->pb_vt_tensors)
3888
64
  {
3889
64
    tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
3890
7.43k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++7.37k
)
3891
7.37k
      if (tensor_arena->vt_tensors[i])
3892
6.15k
        tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
3893
64
  }
3894
83.0k
  if (!tensor_arena->vt_alias_r_refs_p)
3895
64
  {
3896
64
    tensor_arena->vt_alias_r_refs_p = (int*)cccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
3897
64
    tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
3898
7.43k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++7.37k
)
3899
7.37k
      if (tensor_arena->vt_alias_refs[i])
3900
555
      {
3901
555
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
3902
555
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
3903
555
        ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
3904
555
      }
3905
64
    int refp = 0;
3906
7.37k
    for (i = 1; i < tensor_arena->vt_tensor_size; 
i++7.31k
) // Allocate a position on vt_alias_r_refs for each tensor that has aliases. It points to the end.
3907
7.31k
      if (tensor_arena->vt_alias_r_refs_p[i])
3908
549
        refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
3909
6.76k
      else
3910
6.76k
        tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
3911
6.88k
    for (i = refp; i < tensor_arena->vt_tensor_size; 
i++6.82k
)
3912
6.82k
      tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
3913
7.43k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++7.37k
)
3914
7.37k
      if (tensor_arena->vt_alias_refs[i])
3915
555
      {
3916
555
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
3917
555
        assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size);
3918
555
        const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
3919
555
        assert(pos >= 0);
3920
555
        tensor_arena->vt_alias_r_refs[pos] = i;
3921
555
      }
3922
64
  }
3923
83.0k
  const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? 
tensor_arena->vt_alias_refs[symbol.d] - 11
:
symbol.d83.0k
;
3924
83.0k
  if (CCV_IS_TENSOR_VIEW(tensor))
3925
83.0k
  {
3926
0
    assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
3927
0
    assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->inc) == 0 &&
3928
0
          ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
3929
0
        ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->inc) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
3930
0
  } else
3931
83.0k
    { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
3932
83.0k
  if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
3933
83.0k
    
{ assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }0
3934
83.0k
  tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
3935
83.0k
  if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
3936
12.7k
    
for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; 11.7k
i < tensor_arena->vt_tensor_size;
i++1.00k
)
3937
12.7k
    {
3938
12.7k
      const int d = tensor_arena->vt_alias_r_refs[i];
3939
12.7k
      if (d < 0 || 
symbol_d + 1 != tensor_arena->vt_alias_refs[d]2.65k
) // Doesn't match, reached the end of it.
3940
11.7k
        break;
3941
1.00k
      ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
3942
1.00k
      if (CCV_IS_TENSOR_VIEW(d_tensor))
3943
1.00k
        
d_tensor->data.u8 = tensor->data.u8 + ((ccv_nnc_tensor_view_t*)d_tensor)->off2
;
3944
1.00k
      else
3945
1.00k
        d_tensor->data.u8 = tensor->data.u8;
3946
1.00k
    }
3947
83.0k
}
3948
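A typical rebind-and-run loop around the function above (ccv_nnc_tensor_arena_clear_bindings is defined just below); the input symbol `x`, its shape, the batch loop, and the previously compiled graph / tensor_arena are all assumptions for illustration:

  // Illustrative only: point a compiled symbol at external memory for each batch.
  ccv_nnc_tensor_t* const staging = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 16), 0);
  int batch;
  for (batch = 0; batch < 10; batch++)
  {
    // ... fill staging->data.f32 with the next batch ...
    ccv_nnc_tensor_bind_symbol(tensor_arena, x, staging); // Point the arena's view of `x` at external memory.
    ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
    ccv_nnc_tensor_arena_clear_bindings(tensor_arena); // Restore the arena-owned data pointers.
  }
  ccv_nnc_tensor_free(staging);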
3949
void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
3950
14.5k
{
3951
14.5k
  if (!tensor_arena->pb_vt_tensors)
3952
32
    return;
3953
14.4k
  int i;
3954
478k
  for (i = 0; i < tensor_arena->vt_tensor_size; 
i++464k
)
3955
464k
    if (tensor_arena->vt_tensors[i])
3956
291k
      tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
3957
14.4k
}
3958
3959
uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
3960
2
{
3961
2
  uint64_t total_size = 0;
3962
2
  int i;
3963
36
  for (i = 0; i < tensor_arena->buffer_size; 
i++34
)
3964
34
    total_size += tensor_arena->buffers[i].size;
3965
2
  return total_size;
3966
2
}
3967
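A one-line usage sketch for the accessor above, useful when auditing how much memory compilation reserved (requires <stdio.h>; `tensor_arena` is assumed to come from a prior compile):

  printf("tensor arena reserved %llu bytes\n", (unsigned long long)ccv_nnc_tensor_arena_size(tensor_arena));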
3968
static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
3969
0
{
3970
0
  int i;
3971
0
  if (mv->it)
3972
0
    mv->it->info = params;
3973
0
  for (i = 0; i < mv->repeat + mv->kind; i++)
3974
0
  {
3975
0
    ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
3976
0
    if (CCV_IS_TENSOR_MULTIVIEW(tensor))
3977
0
      _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
3978
0
    else
3979
0
      tensor->info = params;
3980
0
  }
3981
0
}
3982
3983
int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
3984
2.20k
{
3985
2.20k
  int i;
3986
2.20k
  assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
3987
2.20k
  if (!tensor_arena->vt_sizes) // Keep the original sizes so we can check against them to see if we will overflow.
3988
6
  {
3989
6
    tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
3990
1.30k
    for (i = 0; i < tensor_arena->vt_tensor_size; 
i++1.30k
)
3991
1.30k
      if (tensor_arena->vt_tensors[i] && 
!tensor_arena->vt_alias_refs[i]1.05k
)
3992
796
      {
3993
796
        ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
3994
796
        if (CCV_IS_TENSOR_MULTIVIEW(tensor))
3995
796
        {
3996
0
          ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
3997
0
          while (CCV_IS_TENSOR_MULTIVIEW(mv))
3998
0
            mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
3999
0
          tensor = (ccv_nnc_tensor_t*)mv;
4000
0
        }
4001
796
        tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4002
796
      }
4003
6
  }
4004
2.20k
  int flag = 0;
4005
19.2k
  for (i = 0; !flag && 
i < tensor_arena->vt_tensor_size19.2k
;
i++17.0k
)
4006
17.0k
    if (tensor_arena->vt_tensors[i] && 
!tensor_arena->vt_alias_refs[i]14.6k
)
4007
13.6k
    {
4008
13.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4009
13.6k
      flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(symbol_info->info));
4010
13.6k
    }
4011
2.20k
  if (flag)
4012
2
    return -1;
4013
19.2k
  
for (i = 0; 2.20k
i < tensor_arena->vt_tensor_size;
i++17.0k
)
4014
17.0k
    if (tensor_arena->vt_tensors[i])
4015
14.6k
    {
4016
14.6k
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4017
14.6k
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4018
14.6k
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4019
14.6k
      {
4020
0
        assert(!tensor_arena->vt_alias_refs[i]);
4021
0
        _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4022
14.6k
      } else if (!tensor_arena->vt_alias_refs[i])
4023
13.6k
        tensor->info = symbol_info->info;
4024
1.00k
      else {
4025
1.00k
        off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->inc, symbol_info->ofs);
4026
1.00k
        tensor->info = symbol_info->info;
4027
1.00k
        const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4028
1.00k
        tensor->data.u8 = tensor_arena->vt_tensors[alias_ref]->data.u8 + off;
4029
1.00k
        if (CCV_IS_TENSOR_VIEW(tensor))
4030
1.00k
          
((ccv_nnc_tensor_view_t*)tensor)->off = off0
;
4031
1.00k
      }
4032
14.6k
    }
4033
2.20k
  // Should handle sub_tensor_arena; we don't do that at the moment.
4034
2.20k
  assert(!graph->sub_graphs);
4035
2.20k
  return 0;
4036
2.20k
}
4037
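The reinit path above is meant to be paired with ccv_nnc_graph_exec_reinit (defined just below) so a compiled graph can be reused after tensor symbol shapes change, as long as no tensor grows beyond its originally allocated size. A hedged sketch, assuming ccv_nnc_tensor_symbol_set from ccv_nnc.h, a previously compiled symbolic_graph / graph / arenas, and a made-up new shape for symbol `x`:

  // Illustrative only: shrink `x`, then rewire the existing allocation to the new shapes.
  ccv_nnc_tensor_symbol_set(symbolic_graph, x, CPU_TENSOR_NHWC(32F, 8));
  if (ccv_nnc_tensor_arena_reinit(tensor_arena, symbolic_graph) == 0)
  {
    ccv_nnc_graph_exec_reinit(graph_exec_arena, graph, symbolic_graph);
    ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  } else {
    // -1 means some tensor would outgrow its original allocation; recompile instead.
  }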
4038
void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4039
2.20k
{
4040
2.20k
  assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4041
2.20k
  int i;
4042
9.01k
  for (i = 0; i < graph_exec_arena->graph_exec_size; 
i++6.81k
)
4043
6.81k
  {
4044
6.81k
    const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4045
6.81k
    if (graph_exec.d < 0)
4046
2.40k
      continue;
4047
4.40k
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4048
4.40k
    ccv_nnc_graph_exec_set(graph, graph_exec, symbol_info->cmd);
4049
4.40k
  }
4050
2.20k
}
4051
4052
void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4053
6.24k
{
4054
6.24k
  int i;
4055
22.3k
  for (i = 0; i < tensor_arena->buffer_size; 
i++16.0k
)
4056
16.0k
  {
4057
16.0k
    if (!tensor_arena->buffers[i].ptr)
4058
248
      continue;
4059
15.8k
    const int buffer_type = tensor_arena->buffers[i].type;
4060
15.8k
    const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4061
15.8k
#ifdef HAVE_CUDA
4062
15.8k
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4063
15.8k
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
4064
2.41k
    {
4065
2.41k
      if (tensor_arena->allocator.isa && 
tensor_arena->allocator.isa->free260
)
4066
260
        tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4067
2.15k
      else
4068
2.15k
        cufree(device_id, tensor_arena->buffers[i].ptr);
4069
13.3k
    } else {
4070
13.3k
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4071
13.3k
      if (tensor_arena->buffers[i].pin_mem)
4072
11
        cuhostfree(tensor_arena->buffers[i].ptr);
4073
13.3k
      else
4074
13.3k
        
ccfree13.3k
(tensor_arena->buffers[i].ptr)13.3k
;
4075
13.3k
    }
4076
#else
4077
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4078
    ccfree(tensor_arena->buffers[i].ptr);
4079
#endif
4080
15.8k
    tensor_arena->buffers[i].ptr = 0;
4081
15.8k
  }
4082
6.24k
}
4083
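Freeing the buffers separately from the arena bookkeeping lets a caller release CPU / GPU memory as soon as the last run finishes while keeping the arena's symbol-to-tensor mapping alive until teardown; since the function above nulls each freed pointer, the later full free is still safe. A brief sketch under those assumptions:

  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0); // last use of the allocated buffers
  ccv_nnc_tensor_arena_buffer_free(tensor_arena);   // release the backing buffers now
  // ... tensor_arena metadata can still be queried here ...
  ccv_nnc_tensor_arena_free(tensor_arena);          // buffers already gone; frees the bookkeeping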
4084
void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4085
6.04k
{
4086
6.04k
  ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4087
6.04k
  _ccv_nnc_tensor_arena_free(tensor_arena);
4088
6.04k
}
4089
4090
void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4091
6.09k
{
4092
6.09k
  int i;
4093
6.14k
  for (i = 0; i < graph_exec_arena->sub_arena_size; 
i++50
)
4094
50
    if (graph_exec_arena->sub_arenas[i])
4095
49
      ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4096
6.09k
  ccfree(graph_exec_arena);
4097
6.09k
}