Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_symbolic_graph_compile.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#ifdef HAVE_CUDA
6
#include "gpu/ccv_nnc_compat.h"
7
#endif
8
#include "_ccv_nnc_graph.h"
9
#include "_ccv_nnc_symbolic_graph.h"
10
11
#pragma mark - Level-3 API
12
13
typedef struct {
14
  int flags;
15
  int type;
16
  int pin_mem; // This memory need to be pinned.
17
  int ref; // Reference to another tensor block. Start with 1.
18
  int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
19
  int companion_ref; // Reference to another block such that the two share the same memory region. Start with 1. The current crude implementation requires the two to be mutual companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. The allocation algorithm uses the primary throughout.
20
  int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
21
  ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
22
  uint64_t size; // The size of the tensor expected.
23
  int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
24
  ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
25
  ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
26
  ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
27
} ccv_nnc_tensor_block_t; // Tensor Arena Block
28
29
1.03M
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
30
31
enum {
32
  UNASSIGNED = 0x1,
33
  ALIAS = 0x2,
34
  READ_ONLY = 0x4,
35
  WRITE_ONLY = 0x8,
36
  READ_WRITE = 0xc,
37
  ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
38
  UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
39
  UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
40
};
41
42
#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
43
#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
44
10.9M
#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
45
2.02k
#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
46
3
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
47
21.7M
#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
48
19.5M
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
49
8.55k
#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
50
2.10k
#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
51
96
#define TENSOR_SET_ANONYMOUS(t) (t.flags = (t.flags & ~0x10 | ANONYMOUS))
52
#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
53
180
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
54
6.36k
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
55
116
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
56
4.28k
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
57
58
26.4k
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
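
Note on the flag helpers above: they pack several orthogonal properties into the single flags int of a tensor block. The low two bits (mask 0x3) hold the assignment state (ordinary, UNASSIGNED, or ALIAS), bits 0x4/0x8 (mask 0xc) hold the read/write mode, and the higher bits mark ANONYMOUS and the two UNFOLDABLE variants. The stand-alone sketch below only illustrates how the same masks compose; demo_block_t is a hypothetical stand-in for ccv_nnc_tensor_block_t, not library code.

#include <assert.h>
#include <stdio.h>

/* Hypothetical stand-in for ccv_nnc_tensor_block_t: only the flags field matters here. */
typedef struct { int flags; } demo_block_t;

enum { DEMO_UNASSIGNED = 0x1, DEMO_ALIAS = 0x2, DEMO_READ_WRITE = 0xc };

int main(void)
{
  demo_block_t t = { .flags = 0 };
  t.flags = (t.flags & ~0x3) | DEMO_UNASSIGNED; /* like TENSOR_EXPECT_SET_UNASSIGNED */
  t.flags = (t.flags & ~0xc) | DEMO_READ_WRITE; /* like TENSOR_SET_READ_WRITE */
  assert((t.flags & 0x3) == DEMO_UNASSIGNED); /* like TENSOR_EXPECT_UNASSIGNED */
  assert((t.flags & 0xc) == DEMO_READ_WRITE); /* like TENSOR_READ_WRITE */
  t.flags = t.flags & ~0x1; /* like TENSOR_EXPECT_UNSET_UNASSIGNED: back to ordinary */
  assert((t.flags & 0x3) == 0); /* like TENSOR_EXPECT_ORDINARY */
  printf("final flags = 0x%x\n", t.flags);
  return 0;
}
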
59
60
// Holds additional information about the exe nodes.
61
typedef struct {
62
  int flags;
63
} ccv_nnc_graph_exec_flag_t;
64
65
enum {
66
  CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
67
};
68
69
typedef struct {
70
  int index;
71
  int companion; // The companion node index (the node that doesn't interfere with current one).
72
  int oc;
73
  int type;
74
  uint64_t size;
75
} ccv_nnc_tensor_opt_t;
76
77
// We first sort the same type together (because they won't be reused at all).
78
// Then we sort by size, and after that by oc.
79
75.5k
#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
80
75.5k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
81
#undef more_than
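
The more_than comparator above (consumed by CCV_IMPLEMENT_QSORT) orders candidate blocks by descending size and, on equal size, by descending overlap count oc. The sketch below expresses the same ordering with the standard qsort purely for illustration; demo_opt_t and demo_cmp are hypothetical names, and this is not how the library itself sorts.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same fields the allocator sorts on. */
typedef struct { uint64_t size; int oc; } demo_opt_t;

/* Descending by size, then descending by oc -- mirrors more_than(i1, i2, aux). */
static int demo_cmp(const void* pa, const void* pb)
{
  const demo_opt_t* a = pa;
  const demo_opt_t* b = pb;
  if (a->size != b->size)
    return (a->size > b->size) ? -1 : 1;
  return (a->oc > b->oc) ? -1 : (a->oc < b->oc);
}

int main(void)
{
  demo_opt_t opt[] = { { 64, 1 }, { 128, 0 }, { 64, 5 } };
  qsort(opt, 3, sizeof(opt[0]), demo_cmp);
  for (int i = 0; i < 3; i++)
    printf("size=%llu oc=%d\n", (unsigned long long)opt[i].size, opt[i].oc);
  return 0; /* Prints 128/0, then 64/5, then 64/1. */
}
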
82
83
// If b has items that overlap with a, a is still after b (inclusive).
84
static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
85
0
{
86
0
  assert(a);
87
0
  assert(b);
88
0
  int x, y;
89
0
  for (x = 0; x < b->rnum; x++)
90
0
  {
91
0
    const int p = *(int*)ccv_array_get(b, x);
92
0
    int flag = 0;
93
0
    // In the extreme case where a is a superset of b, a is still after b, so we are good.
94
0
    for (y = 0; !flag && y < a->rnum; y++)
95
0
    {
96
0
      const int q = *(int*)ccv_array_get(a, y);
97
0
      flag = (p == q);
98
0
    }
99
0
    if (!flag)
100
0
      for (y = 0; y < a->rnum; y++)
101
0
      {
102
0
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
103
0
        if (!cell.i32 || cell.i32[0] == 0)
104
0
          return 0;
105
0
      }
106
0
  }
107
0
  // If b->rnum == 0, a is after b for sure.
108
0
  // Otherwise, if a->rnum == 0, we don't check anything, but if b->rnum > 0, then we cannot say a is after b.
109
0
  // If both a->rnum > 0 and b->rnum > 0, the above logic has checked everything.
110
0
  return (a->rnum > 0 || b->rnum == 0);
111
0
}
112
113
static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
114
480k
{
115
480k
  assert(a);
116
480k
  assert(b);
117
480k
  int x, y, max_hop = 0;
118
502k
  for (x = 0; x < a->rnum; x++)
119
502k
    for (y = 0; y < b->rnum; y++)
120
480k
    {
121
480k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
122
480k
      if (!cell.i32 || cell.i32[0] == 0)
123
457k
        return 0;
124
22.4k
      max_hop = ccv_max(cell.i32[0], max_hop);
125
22.4k
    }
126
480k
  // We've gone through the nested for loop, therefore a must be verifiably, deterministically after b now.
127
480k
  // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
128
480k
  return max_hop;
129
480k
}
130
131
// If every a's head is deterministically after b's tail
132
static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
133
480k
{
134
480k
  return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
135
480k
}
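
The two helpers above encode the lifetime ordering used to decide whether two tensor blocks may share memory: block a can only be placed after block b when every head (first use) of a is deterministically after every tail (last use) of b according to the execution dependency matrix, and the returned max hop measures how far apart they are. Below is a minimal stand-alone sketch of that check over a plain hop table; it is illustrative only (the library queries a ccv_sparse_matrix_t instead), and all names in it are hypothetical.

#include <stdio.h>

#define N 4 /* number of exec nodes in this toy graph: a chain 0 -> 1 -> 2 -> 3 */

/* hop[i][j] > 0 means node i runs deterministically after node j, in hop[i][j] steps. */
static const int hop[N][N] = {
  { 0, 0, 0, 0 },
  { 1, 0, 0, 0 },
  { 2, 1, 0, 0 },
  { 3, 2, 1, 0 },
};

/* Returns the max hop if every node of `head` is after every node of `tail`, 0 otherwise. */
static int head_after_tail(const int* head, int head_n, const int* tail, int tail_n)
{
  int max_hop = 0;
  for (int x = 0; x < head_n; x++)
    for (int y = 0; y < tail_n; y++)
    {
      if (hop[head[x]][tail[y]] == 0)
        return 0;
      if (hop[head[x]][tail[y]] > max_hop)
        max_hop = hop[head[x]][tail[y]];
    }
  return max_hop;
}

int main(void)
{
  const int a_head[] = { 2, 3 }, b_tail[] = { 0, 1 };
  const int b_head[] = { 0 }, a_tail[] = { 3 };
  /* a's lifetime starts strictly after b's ends: the two blocks could share one memory region. */
  printf("a after b: %d\n", head_after_tail(a_head, 2, b_tail, 2)); /* prints 3 */
  /* The reverse does not hold, which is expected: b runs before a. */
  printf("b after a: %d\n", head_after_tail(b_head, 1, a_tail, 1)); /* prints 0 */
  return 0;
}
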
136
137
typedef struct {
138
  ccv_array_t** alloc_dep;
139
  int vt_block_size;
140
  int buffer_size;
141
  int block_size;
142
  int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
143
  struct {
144
    int type; // The type from tensor blocks.
145
    int pin_mem; // Whether this is pinned memory.
146
    int flags; // The flags (currently for READ_ONLY or not).
147
    uint64_t size; // The size of the buffer allocated.
148
    int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
149
    ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
150
  }* buffers;
151
  struct {
152
    int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
153
    int block_ref; // A reference to which block in the given tensor_block to use.
154
    uint64_t offset; // The offset of this block.
155
  }* blocks;
156
} ccv_nnc_tensor_alloc_prep_t;
157
158
typedef struct ccv_nnc_symbolic_graph_prep_s {
159
  int flags;
160
  int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
161
  int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
162
  int exec_idx;
163
  int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
164
  int tensor_symbol_info_size;
165
  int exec_symbol_info_size;
166
  int tensor_block_size;
167
  int sub_prep_size;
168
  ccv_nnc_tensor_block_t* tensor_blocks;
169
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
170
  ccv_nnc_graph_exec_flag_t* exec_flags;
171
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
172
  int* dup_tensor_block_ref;
173
  ccv_nnc_graph_visit_t* visit;
174
  ccv_nnc_tensor_alloc_prep_t* alloc_prep;
175
  struct ccv_nnc_symbolic_graph_prep_s* p;
176
  struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
177
  // Structures that don't require to be freed after deallocation.
178
  const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
179
  ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
180
  ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
181
  ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
182
} ccv_nnc_symbolic_graph_prep_t;
183
184
static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
185
1.17k
{
186
1.17k
  // Compute how many discontinuous buffers are needed.
188
1.17k
  // We prefer to have several discontinuous buffers instead of one big buffer because
188
1.17k
  // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
189
1.17k
  // to fully utilize memory.
190
1.17k
  int i, j, k;
191
1.17k
  ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
192
1.17k
  int allocable_tensor_size = 0, available_tensor_size = 0;
193
21.5k
  for (i = 0; i < tensor_block_size; i++)
194
20.3k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
195
20.3k
    {
196
8.31k
      // Tensors for which we need the header info.
197
8.31k
      ++available_tensor_size;
198
8.31k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
199
8.31k
        // Tensors that we actually need to allocate (exclude the alias).
200
8.31k
        
++allocable_tensor_size8.18k
;
201
8.31k
    }
202
1.17k
  ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
203
1.17k
  ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
204
1.17k
  ccv_sparse_matrix_t* const tensor_itf = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_8U | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
205
1.17k
  // Overlap count.
206
21.5k
  for (i = 0; i < tensor_block_size; i++)
207
1.40M
    for (j = i + 1; j < tensor_block_size; j++)
208
1.38M
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
209
1.38M
      {
210
240k
        // Check to see if they interfere (default to yes).
211
240k
        // If any of the i's head is deterministically later than j's tail
212
240k
        // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
213
240k
        const uint8_t one = 1;
214
240k
        const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
215
240k
        if (i_hop_j > 0)
216
218
        {
217
218
          ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
218
218
          ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
219
218
        }
220
240k
        const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
221
240k
        if (j_hop_i > 0)
222
22.1k
        {
223
22.1k
          ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
224
22.1k
          ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
225
22.1k
        }
226
240k
        // It cannot be that both i can hop to j and j can hop to i.
227
240k
        assert(!(i_hop_j > 0 && j_hop_i > 0));
228
240k
        if (!i_hop_j && !j_hop_i)
229
217k
          ccv_set_sparse_matrix_cell(tensor_itf, i, j, &one);
230
240k
      }
231
1.17k
  int* const oc = (int*)cccalloc(tensor_block_size, sizeof(int));
232
21.5k
  for (i = 0; i < tensor_block_size; i++)
233
2.80M
    for (j = 0; j < tensor_block_size; j++)
234
2.78M
      // If these two tensors are still alive, analyze them.
235
2.78M
      if (i != j && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
236
2.78M
      {
237
480k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(i, j), ccv_max(i, j));
238
480k
        // If their life time overlaps, compute how many tensors it overlap.
239
480k
        if (cell.u8 && cell.u8[0] == 1)
240
435k
          ++oc[i];
241
480k
      }
242
1.17k
  int* const buf = (int*)ccmalloc(sizeof(int) * tensor_block_size);
243
1.17k
  int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
244
1.17k
  uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
245
1.17k
  uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
246
1.17k
  int num_assigned = 0; 
247
1.17k
  // I can do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
248
1.17k
  // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
249
1.17k
  // The first channel denotes the bytes available for allocation,
250
1.17k
  // the second channel denotes the offset available for the allocation,
251
1.17k
  ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
252
1.17k
  ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
253
9.08k
  for (j = 0; j < allocable_tensor_size;)
254
7.90k
  {
255
7.90k
    // Find the unassigned one with the largest overlap (or, if the overlap ties, the larger size).
256
7.90k
    int max_oc = 0;
257
7.90k
    uint64_t max_size = 0;
258
7.90k
    ccv_array_clear(opt);
259
7.90k
    int current_type = 0; // Deal with one type at a time.
260
984k
    for (i = 0; i < tensor_block_size; i++)
261
976k
      if (oc[i] >= max_oc &&
262
976k
        TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
263
976k
        IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
264
976k
        (!current_type || tensor_blocks[i].type == current_type))
265
25.1k
      {
266
25.1k
        ccv_nnc_tensor_opt_t a = {
267
25.1k
          .size = tensor_blocks[i].size,
268
25.1k
          .index = i,
269
25.1k
          .companion = -1, // If already have a designated companion, use that.
270
25.1k
          .oc = oc[i],
271
25.1k
          .type = tensor_blocks[i].type,
272
25.1k
        };
273
25.1k
        assert(a.type);
274
25.1k
        current_type = a.type; // Now we know the primary type we should deal with.
275
25.1k
        if (tensor_blocks[i].companion_ref)
276
29
        {
277
29
          const int companion_ref = tensor_blocks[i].companion_ref - 1;
278
29
          a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
279
29
          a.oc += oc[companion_ref];
280
29
        }
281
25.1k
        // In case we have a tie, take them all in the array.
282
25.1k
        if (a.oc > max_oc || (a.oc == max_oc && a.size > max_size))
283
13.5k
          ccv_array_clear(opt), max_oc = a.oc, max_size = a.size;
284
25.1k
        ccv_array_push(opt, &a);
285
25.1k
      }
286
7.90k
    assert(opt->rnum > 0);
287
7.90k
    // Go through the opt array, find all tensors that don't interfere with it and have a larger tensor size.
288
7.90k
    // Push them with the "companion" into the opt array as well.
289
7.90k
    const int rnum = opt->rnum;
290
24.2k
    for (i = 0; i < rnum; i++)
291
16.3k
    {
292
16.3k
      // Copy it out, because after insertion, it may hold invalid pointer.
293
16.3k
      ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
294
16.3k
      assert(a.companion == -1);
295
16.3k
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
296
3.59M
      for (k = 0; k < tensor_block_size; k++)
297
3.57M
        // Find non-overlapping tensor that has larger size (of course, is unassigned and is not one with designated companion).
298
3.57M
        if (k != a.index && !tensor_blocks[k].companion_ref &&
299
3.57M
          TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]) && !assigned[k] &&
300
3.57M
          tensor_blocks[k].size > a.size && tensor_blocks[k].type == a.type)
301
170k
        {
302
170k
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(a.index, k), ccv_max(a.index, k));
303
170k
          // Good, push to opt array.
304
170k
          if (cell.u8 && cell.u8[0] == 1)
305
161k
            continue;
306
9.23k
          if (companion_ref >= 0)
307
0
          {
308
0
            assert(companion_ref != k);
309
0
            // Have to make sure k doesn't interfere with the designated companion as well.
310
0
            ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(companion_ref, k), ccv_max(companion_ref, k));
311
0
            if (cell.u8 && cell.u8[0] == 1)
312
0
              continue;
313
9.23k
          }
314
9.23k
          ccv_nnc_tensor_opt_t b = a;
315
9.23k
          b.companion = k;
316
9.23k
          b.oc = a.oc + oc[k];
317
9.23k
          b.size = tensor_blocks[k].size;
318
9.23k
          ccv_array_push(opt, &b);
319
9.23k
        }
320
16.3k
    }
321
7.90k
    // Order opt array by the oc because type and size should be equal at this point.
322
7.90k
    _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
323
7.90k
    // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
324
7.90k
    int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep->rows * 3;
325
7.90k
    uint64_t min_val[2] = {
326
7.90k
      0, 0
327
7.90k
    };
328
25.5k
    for (i = 0; i < opt->rnum; i++)
329
21.2k
    {
330
21.2k
      ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
331
21.2k
      // Now, determine the order between a and c. After this, we can always check whether y
332
21.2k
      // can hop to the earliest one and if the latest one can hop to x.
333
21.2k
      // The earliest one will be called p and the latest one will be called q.
334
21.2k
      int p = a.index;
335
21.2k
      int q = a.index;
336
21.2k
      if (a.companion >= 0)
337
6.75k
      {
338
6.75k
        const ccv_numeric_data_t a_hop_c = ccv_get_sparse_matrix_cell(tensor_dt, a.companion, a.index);
339
6.75k
        const ccv_numeric_data_t c_hop_a = ccv_get_sparse_matrix_cell(tensor_dt, a.index, a.companion);
340
6.75k
        assert((a_hop_c.i32 && a_hop_c.i32[0] > 0 && (c_hop_a.i32 == 0 || c_hop_a.i32[0] == 0)) ||
341
6.75k
            ((a_hop_c.i32 == 0 || a_hop_c.i32[0] == 0) && c_hop_a.i32 && c_hop_a.i32[0] > 0));
342
6.75k
        if (a_hop_c.i32 && a_hop_c.i32[0] > 0)
343
2.50k
          q = a.companion;
344
4.24k
        else
345
4.24k
          p = a.companion;
346
6.75k
      }
347
21.2k
      if (tensor_blocks[a.index].companion_ref)
348
28
      {
349
28
        const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
350
28
        assert(a.companion != companion_ref);
351
28
        const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
352
28
        if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
353
2
          p = companion_ref;
354
26
        else {
355
26
          const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
356
26
          if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
357
26
            q = companion_ref;
358
0
          else { // Otherwise, b is in between p and q.
359
0
            const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
360
0
            const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
361
0
            assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
362
0
          }
363
26
        }
364
28
      }
365
21.2k
      assert(tensor_blocks[q].type == tensor_blocks[p].type);
366
21.2k
      const int type = tensor_blocks[p].type;
367
21.2k
      // y is always earlier than x, but this is hard to assert now.
368
21.2k
      // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
369
21.2k
      // Thus, the hop between y and x (through a) should be smallest ones.
370
21.2k
      // We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
371
21.2k
      // out of q. For these nodes, we try to verify whether they form a connection (by checking against
372
21.2k
      // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
373
21.2k
      int y_size = 0;
374
21.2k
      int* const y_buf = buf;
375
63.3k
#define for_block(y, val) do { \
376
63.3k
        if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type) \
377
63.3k
          y_buf[y_size++] = y + 1; \
378
63.3k
      } while(0)
379
21.2k
      ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
380
21.2k
      if (y_vector)
381
63.3k
        CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
382
21.2k
#undef for_block
383
21.2k
      assert(y_size <= tensor_block_size);
384
21.2k
      int x_size = 0;
385
21.2k
      int* const x_buf = buf + y_size;
386
51.9k
#define for_block(x, val) do { \
387
51.9k
        if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type) \
388
51.9k
          x_buf[x_size++] = x + 1; \
389
51.9k
      } while(0)
390
21.2k
      ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
391
21.2k
      if (x_vector)
392
51.9k
        
CCV_SPARSE_VECTOR_FOREACH8.07k
(tensor_df, x_vector, for_block);
393
21.2k
#undef for_block
394
21.2k
      assert(y_size + x_size <= tensor_block_size);
395
21.2k
      int x, y;
396
53.7k
      for (y = 0; y < y_size; y++)
397
32.4k
      {
398
32.4k
        const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y], tensor_block_size + 1);
399
32.4k
        if (val.u64 && val.u64[0] >= a.size)
400
1.90k
        {
401
1.90k
          const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
402
1.90k
          assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
403
1.90k
          const int hop = exec_dep->rows + y_hop_p.i32[0];
404
1.90k
          if (hop < min_hop)
405
1.42k
            min_y = y_buf[y], min_x = tensor_block_size + 1, min_hop = hop,
406
1.42k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
407
1.90k
        }
408
32.4k
      }
409
45.0k
      for (x = 0; x < x_size; x++)
410
23.8k
      {
411
23.8k
        const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x]);
412
23.8k
        if (val.u64 && val.u64[0] >= a.size)
413
3.05k
        {
414
3.05k
          const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
415
3.05k
          assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
416
3.05k
          const int hop = exec_dep->rows + q_hop_x.i32[0];
417
3.05k
          if (hop < min_hop)
418
2.44k
            min_y = 0, min_x = x_buf[x], min_hop = hop,
419
2.44k
              min_val[0] = val.u64[0], min_val[1] = val.u64[1];
420
3.05k
        }
421
23.8k
      }
422
53.7k
      for (y = 0; y < y_size; y++)
423
32.4k
      {
424
32.4k
        ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y]);
425
32.4k
        if (y_vector)
426
61.2k
          for (x = 0; x < x_size; x++)
427
28.7k
          {
428
28.7k
            const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x]);
429
28.7k
            if (val.u64 && val.u64[0] >= a.size)
430
28
            {
431
28
              const ccv_numeric_data_t y_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, y_buf[y] - 1);
432
28
              const ccv_numeric_data_t q_hop_x = ccv_get_sparse_matrix_cell(tensor_dt, x_buf[x] - 1, q);
433
28
              assert(y_hop_p.i32 && y_hop_p.i32[0] > 0);
434
28
              assert(q_hop_x.i32 && q_hop_x.i32[0] > 0);
435
28
              const int hop = y_hop_p.i32[0] + q_hop_x.i32[0];
436
28
              if (hop < min_hop)
437
17
                min_y = y_buf[y], min_x = x_buf[x], min_hop = hop,
438
17
                  min_val[0] = val.u64[0], min_val[1] = val.u64[1];
439
28
            }
440
28.7k
          }
441
32.4k
      }
442
21.2k
      // If I found a place, stop, and exit.
443
21.2k
      if (min_y > 0 || min_x < tensor_block_size + 1)
444
3.64k
      {
445
3.64k
        min_i = i;
446
3.64k
        break;
447
3.64k
      }
448
21.2k
    }
449
7.90k
    // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
450
7.90k
    // and default to largest size available.
451
7.90k
    ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
452
7.90k
    if (min_i == -1)
453
4.26k
    {
454
4.26k
      allocated_size[num_assigned] = a.size;
455
4.26k
      ++num_assigned;
456
4.26k
    }
457
7.90k
    int assign_group = num_assigned;
458
7.90k
    if (min_y > 0)
459
1.28k
    {
460
1.28k
      assign_group = assigned[min_y - 1];
461
1.28k
      // The y and x should belong to the same assigned group.
462
1.28k
      assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
463
6.61k
    } else if (min_x < tensor_block_size + 1)
464
2.35k
      assign_group = assigned[min_x - 1];
465
7.90k
    // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
466
7.90k
    if (min_y != 0 || min_x != tensor_block_size + 1)
467
3.64k
    {
468
3.64k
      uint64_t val[2] = {
469
3.64k
        min_val[0], min_val[1]
470
3.64k
      };
471
3.64k
      assert(val[0] >= a.size);
472
3.64k
      val[0] -= a.size;
473
3.64k
      val[1] = val[1] + a.size; // Move the offset to the next one.
474
3.64k
      ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
475
3.64k
    }
476
7.90k
    int strings[3];
477
7.90k
    strings[0] = a.index + 1;
478
7.90k
    int string_size = 1;
479
7.90k
    // Assign out companion as well.
480
7.90k
    if (a.companion >= 0)
481
257
    {
482
257
      const ccv_numeric_data_t a_hop_c = ccv_get_sparse_matrix_cell(tensor_dt, a.companion, a.index);
483
257
      if (a_hop_c.i32 && a_hop_c.i32[0] > 0)
484
163
        strings[1] = a.companion + 1;
485
94
      else {
486
94
        strings[1] = strings[0];
487
94
        strings[0] = a.companion + 1;
488
94
      }
489
257
      ++string_size;
490
257
    }
491
7.90k
    // Assign out designated companion if it exist.
492
7.90k
    if (tensor_blocks[a.index].companion_ref && a.companion != tensor_blocks[a.index].companion_ref - 1)
493
20
    {
494
20
      const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495
20
      assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
496
20
      const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497
20
      if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498
2
      {
499
4
        for (i = 0; i < string_size; i++)
500
2
          strings[i + 1] = strings[i];
501
2
        strings[0] = companion_ref + 1;
502
18
      } else {
503
18
        const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504
18
        if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505
18
          strings[string_size] = companion_ref + 1;
506
0
        else {
507
0
          // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must in between p and q. Therefore, I must have 2 allocations.
508
0
          assert(string_size == 2);
509
0
          strings[2] = strings[1];
510
0
          strings[1] = companion_ref + 1;
511
0
        }
512
18
      }
513
20
      ++string_size;
514
20
    }
515
7.90k
    // Assign out and update oc.
516
16.0k
    for (i = 0; i < string_size; i++)
517
8.18k
    {
518
8.18k
      const int index = strings[i] - 1;
519
8.18k
      // Assign out the selected one.
520
8.18k
      assigned[index] = assign_group;
521
8.18k
      // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522
8.18k
      allocated_offset[index] = min_val[1];
523
1.16M
      for (k = 0; k < tensor_block_size; k++)
524
1.15M
        if (!assigned[k] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[k]))
525
1.15M
        {
526
240k
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(tensor_itf, ccv_min(k, index), ccv_max(k, index));
527
240k
          if (cell.u8 && cell.u8[0] == 1)
528
217k
            --oc[k];
529
240k
        }
530
8.18k
    }
531
7.90k
    uint64_t val[2] = {
532
7.90k
      a.size, min_val[1]
533
7.90k
    };
534
7.90k
    uint64_t consumed_size = 0;
535
7.90k
    // Go over from min_y to string_size (excluding min_x).
536
8.07k
    for (i = 0; i < string_size; i++)
537
8.07k
    {
538
8.07k
      const uint64_t size = tensor_blocks[strings[i] - 1].size;
539
8.07k
      assert(size <= a.size);
540
8.07k
      // Update consumed size if it is bigger than "size".
541
8.07k
      if (size > consumed_size)
542
8.07k
      {
543
8.07k
        val[0] = size - consumed_size;
544
8.07k
        ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545
8.07k
        consumed_size = size;
546
8.07k
        val[1] = min_val[1] + consumed_size;
547
8.07k
      }
548
8.07k
      // If it consumed all the flow, break out.
549
8.07k
      if (consumed_size == a.size)
550
7.90k
        break;
551
8.07k
    }
552
16.0k
    for (i = 0; i < string_size; i++)
553
8.18k
    {
554
8.18k
      const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555
8.18k
      uint64_t val[2] = {
556
8.18k
        i_size, min_val[1]
557
8.18k
      };
558
8.18k
      uint64_t consumed_size = 0;
559
8.27k
      for (k = i + 1; k < string_size; k++)
560
277
      {
561
277
        const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
562
277
        // Update consumed size if it is bigger than "size".
563
277
        if (size > consumed_size)
564
277
        {
565
277
          val[0] = size - consumed_size;
566
277
          ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567
277
          consumed_size = size;
568
277
          val[1] = min_val[1] + consumed_size;
569
277
        }
570
277
        // If it consumed all the flow, break out.
571
277
        if (consumed_size == i_size)
572
183
          break;
573
277
      }
574
8.18k
      val[0] = i_size - consumed_size;
575
8.18k
      // Still have residual, flow it to min_x.
576
8.18k
      if (val[0] > 0)
577
8.00k
        ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578
8.18k
    }
579
7.90k
    j += string_size;
580
7.90k
  }
581
1.17k
  ccfree(buf);
582
1.17k
  ccv_array_free(opt);
583
1.17k
  ccv_matrix_free(tensor_df);
584
1.17k
  ccv_matrix_free(tensor_dt);
585
1.17k
  ccv_matrix_free(tensor_itf);
586
16.3k
#define for_block(y, x, val) do { \
587
16.3k
    if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
588
16.3k
    { \
589
3.94k
      if (!alloc_dep[x - 1]) \
590
3.94k
        alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
591
3.94k
      ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
592
3.94k
    } \
593
16.3k
  } while (0)
594
16.3k
  CCV_SPARSE_FOREACH(alloc, for_block);
595
1.17k
#undef for_block
596
1.17k
  ccv_matrix_free(alloc);
597
1.17k
  ccfree(oc);
598
1.17k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
599
1.17k
  alloc_prep->alloc_dep = alloc_dep;
600
1.17k
  alloc_prep->vt_block_size = tensor_block_size;
601
1.17k
  alloc_prep->buffer_size = num_assigned;
602
1.17k
  alloc_prep->block_size = available_tensor_size;
603
1.17k
  alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
604
1.17k
  alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
605
1.17k
  alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
606
1.17k
  memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
607
5.43k
  for (i = 0; i < num_assigned; i++)
608
4.26k
    alloc_prep->buffers[i].size = allocated_size[i];
609
1.17k
  ccfree(allocated_size);
610
1.17k
  j = 0;
611
1.17k
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
612
21.5k
  for (i = 0; i < tensor_block_size; i++)
613
20.3k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
614
20.3k
    {
615
8.31k
      alloc_prep->blocks[j].block_ref = i;
616
8.31k
      if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
617
8.31k
      {
618
8.18k
        alloc_prep->vt_blocks[i] = j;
619
8.18k
        // Also, set its allocations.
620
8.18k
        assert(assigned[i] > 0);
621
8.18k
        const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
622
8.18k
        alloc_prep->blocks[j].offset = allocated_offset[i];
623
8.18k
        if (!alloc_prep->buffers[buffer_ref].type)
624
4.26k
          alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
625
8.18k
        alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
626
8.18k
        alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
627
8.18k
        assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
628
8.18k
      } else {
629
127
        alloc_prep->vt_blocks[i] = -1;
630
127
        alloc_prep->blocks[j].buffer_ref = -1;
631
127
        alloc_prep->blocks[j].offset = 0;
632
127
      }
633
8.31k
      ++j;
634
8.31k
    } else
635
12.0k
      alloc_prep->vt_blocks[i] = -1;
636
1.17k
  ccfree(allocated_offset);
637
1.17k
  ccfree(assigned);
638
1.17k
  return alloc_prep;
639
1.17k
}
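
Taken as a whole, _ccv_nnc_tensor_alloc_prep_new above is a greedy planner: it repeatedly picks the unassigned block with the highest interference count (breaking ties by size), optionally pairs it with a larger non-interfering companion, and threads it into the allocation flow graph so that blocks whose lifetimes do not overlap reuse the same buffer bytes. The sketch below shows that underlying idea in a heavily simplified form (plain arrays, first-fit offsets, no companions and no flow graph); it is an illustration under those assumptions, not the library's algorithm.

#include <stdint.h>
#include <stdio.h>

#define NB 4

/* Per-block size and an interference matrix: itf[i][j] != 0 means lifetimes overlap. */
static const uint64_t size[NB] = { 64, 32, 64, 16 };
static const int itf[NB][NB] = {
  { 0, 1, 0, 1 },
  { 1, 0, 1, 0 },
  { 0, 1, 0, 1 },
  { 1, 0, 1, 0 },
};

int main(void)
{
  uint64_t offset[NB] = { 0 };
  int assigned[NB] = { 0 };
  for (int n = 0; n < NB; n++)
  {
    /* Greedy: next unassigned block with the largest size (the real code also weighs the interference count). */
    int best = -1;
    for (int i = 0; i < NB; i++)
      if (!assigned[i] && (best < 0 || size[i] > size[best]))
        best = i;
    /* First-fit: bump the offset past any already-placed block it interferes with. */
    uint64_t ofs = 0;
    int changed = 1;
    while (changed)
    {
      changed = 0;
      for (int i = 0; i < NB; i++)
        if (assigned[i] && itf[best][i] && ofs < offset[i] + size[i] && offset[i] < ofs + size[best])
          ofs = offset[i] + size[i], changed = 1;
    }
    offset[best] = ofs;
    assigned[best] = 1;
  }
  for (int i = 0; i < NB; i++)
    printf("block %d -> offset %llu\n", i, (unsigned long long)offset[i]);
  /* Blocks 0 and 2 share offset 0, blocks 1 and 3 share offset 64: 96 bytes instead of 176. */
  return 0;
}
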
640
641
static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
642
1.17k
{
643
1.17k
  int i;
644
21.5k
  for (i = 0; i < alloc_prep->vt_block_size; i++)
645
20.3k
    if (alloc_prep->alloc_dep[i])
646
3.66k
      ccv_array_free(alloc_prep->alloc_dep[i]);
647
5.43k
  for (i = 0; i < alloc_prep->buffer_size; i++)
648
4.26k
    if (alloc_prep->buffers[i].dup_p_refs)
649
13
      ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
650
1.17k
  ccfree(alloc_prep->alloc_dep);
651
1.17k
  ccfree(alloc_prep);
652
1.17k
}
653
654
// Simple allocator from ccv_array_t.
655
static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
656
18.2k
{
657
18.2k
  int pos = tensor_metadata->rnum;
658
18.2k
  int rsize = (size + 15) / 16;
659
18.2k
  ccv_array_resize(tensor_metadata, pos + rsize);
660
18.2k
  return (pos << 1) + 1;
661
18.2k
}
662
663
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
664
39.2k
{
665
39.2k
  assert((pos >> 1) < tensor_metadata->rnum);
666
39.2k
  return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
667
39.2k
}
668
669
20.8k
#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
670
671
static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
672
20.3k
{
673
20.3k
  // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
674
20.3k
  if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
675
20.3k
    return vt_tensor;
676
20.3k
  ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
677
20.3k
  if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref))
678
20.3k
  {
679
67
    const int alias_ref = tensor->alias_ref;
680
67
    tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
681
67
    _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
682
67
  }
683
20.3k
  if (CCV_IS_TENSOR_MULTIVIEW(tensor))
684
20.3k
  {
685
75
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
686
75
    int i;
687
75
    const int count = mv->kind + mv->repeat;
688
240
    for (i = 0; i < count; i++)
689
165
    {
690
165
      if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
691
165
      {
692
147
        const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
693
147
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
694
147
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
695
147
      }
696
165
    }
697
75
    // No need to recursively do parent pointer, otherwise we are in deep rewire.
698
75
    if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p))
699
75
      mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
700
75
    if (mv->sp)
701
65
      for (i = 0; i < mv->sp->rnum; i++)
702
37
      {
703
37
        ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
704
37
        if (CCV_NNC_IS_METADATA_POS(*tensor))
705
37
        {
706
30
          const int pos = (int)(intptr_t)*tensor;
707
30
          *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
708
30
          assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
709
30
          _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
710
30
        }
711
37
      }
712
75
  }
713
20.3k
  return tensor;
714
20.3k
}
715
716
typedef struct {
717
  const uint8_t* ptr;
718
  int pos;
719
} ccv_nnc_tensor_block_pos_t;
720
721
static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
722
114
{
723
114
  int i;
724
114
  int unref_block_ref = block_ref;
725
117
  while (prep->tensor_blocks[unref_block_ref].ref)
726
3
    unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
727
114
  int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
728
114
  assert(vt_ref >= 0);
729
114
  assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
730
114
  const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
731
114
  uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
732
114
  int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
733
114
  for (i = idx - 1; i >= 0; i--)
734
114
  {
735
114
    assert(p_ref >= 0);
736
114
    const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
737
114
    const int unroll_count = graph_prep->unroll_count;
738
114
    if (ch[i]) // Prefer the dup side of things.
739
12
      p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
740
114
    int unref_p_ref = p_ref;
741
114
    while (graph_prep->tensor_blocks[unref_p_ref].ref)
742
0
      unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
743
114
    vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
744
114
    const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
745
114
    offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
746
114
    // If the buffer already exists, prefer that.
747
114
    const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
748
114
    if (ptr)
749
114
    {
750
114
      // If I have any remaining path that is not covered from 0, I cannot possibly
751
114
      // have any pointer from buffer (that can only happen if it is not dup).
752
138
      for (--i; i >= 0; i--)
753
24
        if (ch[i] != 0)
754
0
          return 0;
755
114
      // Try to find the created tensor block pos in the array, just linear scan.
756
114
      const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
757
114
      ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
758
114
      *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr + offset, params, 0);
759
114
      return tv_pos;
760
0
    }
761
0
    p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
762
0
  }
763
114
  return 0;
764
114
}
765
766
// Descent from root to the prep level, and compose multiview from there.
767
static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
768
114
{
769
114
  assert(pos_ref);
770
114
  int i;
771
114
  const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
772
114
  const int unroll_count = prep->unroll_count;
773
114
  if (prep == graph_prep)
774
57
  {
775
57
    const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
776
57
    if (!data_pos)
777
0
      return -1;
778
57
    // Based on ch, go all the way back to find the exact pointer to compose.
779
57
    if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
780
57
      prep->dup_tensor_block_ref &&
781
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
782
57
      prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
783
41
    {
784
41
      int pos[unroll_count + 1];
785
41
      pos[0] = data_pos;
786
98
      for (i = 0; i < unroll_count; i++)
787
57
        pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
788
41
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
789
41
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
790
41
      ccv_nnc_tensor_t* data[unroll_count + 1];
791
139
      for (i = 0; i < unroll_count + 1; i++)
792
98
        data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
793
41
      ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
794
139
      for (i = 0; i < unroll_count + 1; i++)
795
98
        CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
796
41
      *pos_ref = mv_pos;
797
41
    } else {
798
16
      *pos_ref = data_pos;
799
16
    }
800
57
    if (preserve)
801
5
    {
802
5
      // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv;
803
5
      // at any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
804
5
      // mv of K11: thus, when the loop count is 0, it unwraps to mv->data[0]; otherwise, it unwraps to mv->data[1].
805
5
      // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
806
5
      // arena is allocated).
807
5
      // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
808
5
      // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
809
5
      // it to a K01 structure.
810
5
      // Why didn't we wrap it directly as mv->data[0] pointing to an assigned tv pointer and mv->data[1] pointing
811
5
      // to the raw pointer (as ptr_ref) with K11? The reason is that we don't know whether the assigned tv points to one
812
5
      // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
813
5
      int prev_mv_pos = *pos_ref;
814
5
      if (prev_mv_pos == -1)
815
0
      {
816
0
        prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
817
0
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
818
0
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
819
0
        ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
820
0
          tv,
821
0
        }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
822
0
        CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
823
0
      }
824
5
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
825
5
      ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
826
5
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
827
5
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
828
5
        CCV_NNC_TENSOR_PLACEHOLDER,
829
5
        (ccv_nnc_tensor_t*)prev_mv,
830
5
      }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
831
5
      prev_mv->p = (void*)(intptr_t)mv_pos;
832
5
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
833
5
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
834
5
      *pos_ref = mv_pos;
835
5
    }
836
57
    return 0;
837
57
  }
838
57
  ch[idx] = 0;
839
57
  int pos[unroll_count + 1];
840
57
  pos[0] = 0;
841
57
  const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
842
57
  assert(retval == 0);
843
67
  for (i = 0; i < unroll_count; i++)
844
10
  {
845
10
    ch[idx] = i + 1;
846
10
    pos[i + 1] = 0;
847
10
    const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
848
10
    if (dup_retval < 0)
849
0
    {
850
0
      assert(i == 0);
851
0
      break;
852
0
    }
853
10
  }
854
57
  // If current prep has no dup.
855
57
  if (i == 0)
856
47
  {
857
47
    *pos_ref = pos[0];
858
47
    return 0;
859
47
  }
860
10
  ccv_nnc_tensor_t* data[unroll_count + 1];
861
10
  // Compose to a new multiview.
862
30
  for (i = 0; i < unroll_count + 1; i++)
863
20
    { assert(pos[i] > 0); }
864
10
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865
30
  for (i = 0; i < unroll_count + 1; i++)
866
20
    data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
867
10
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
868
10
  ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
869
30
  for (i = 0; i < unroll_count + 1; i++)
870
20
    if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
871
20
      ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
872
30
  for (i = 0; i < unroll_count + 1; i++)
873
20
    CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
874
10
  *pos_ref = mv_pos;
875
10
  return 0;
876
10
}
877
878
static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
879
312
{
880
312
  int i;
881
312
  int is_input = 0;
882
312
  assert(node);
883
766
  for (i = 0; i < node->input_size && !is_input; i++)
884
454
    if (p_ref == node->inputs[i])
885
153
      is_input = 1;
886
312
  int is_output = 0;
887
725
  for (i = 0; i < node->output_size && !is_output; i++)
888
413
    if (p_ref == node->outputs[i])
889
167
      is_output = 1;
890
312
  // Prefer it is an output if it is both the input and the output.
891
312
  if (is_output)
892
167
    return 1;
893
145
  if (is_input)
894
145
    return -1;
895
0
  return 0;
896
0
}
897
898
static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
899
61
{
900
61
  // No need to check whether to preserve if this is not a while loop.
901
61
  if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
902
8
    return 0;
903
53
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
904
53
  // If it is unassigned, no need to preserve.
905
53
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
906
53
    return 0;
907
51
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
908
51
  // If p is not input, no need to preserve at all.
909
51
  if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
910
19
    return 0;
911
32
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
912
32
  assert(vt_ref >= 0);
913
32
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
914
32
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
915
32
  // If the buffer is a truly read-only one, no need to preserve.
916
32
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
917
6
    return 0;
918
26
  /* This needs detailed explanation, what does preserve mean?
919
26
   * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
920
26
   * also used outside of the while loop, we cannot reuse the memory region of x for
921
26
   * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
922
26
   * y uses the same memory region as x). The way to workaround this is by using a different
923
26
   * memory region for y = x + 1, but for the first iteration, having x pointing to the
924
26
   * original. During the allocation process, the way to identify whether x should preserve
925
26
   * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
926
26
   * parent tensor is the same as the memory region it plans to use in the buffer, then we are
927
26
   * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
928
26
   * it is the input tensor whenever that is possible. A tensor block can point to two parent
929
26
   * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
930
26
   * tensor whenever that is possible. */
931
26
  if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
932
15
    return 0;
933
11
  // Otherwise, return 1 because we now need to preserve.
934
11
  return 1;
935
11
}
936
937
static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
938
58
{
939
58
  assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
940
58
  // If it is unassigned, no need to preserve.
941
58
  if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
942
58
    return 0;
943
58
  // Only tape var need to force broadcast, otherwise we already share the same memory region.
944
58
  if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
945
54
    return 0;
946
4
  const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
947
4
  // If p is not output, no need to broadcast at all.
948
4
  if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
949
3
    return 0;
950
1
  const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
951
1
  assert(vt_ref >= 0);
952
1
  assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
953
1
  const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
954
1
  // If the buffer is a truly read-only one, no need to broadcast.
955
1
  if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
956
0
    return 0;
957
1
  // Otherwise, return 1 because we now need to force broadcast for this tape var.
958
1
  return 1;
959
1
}
960
961
static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
962
25
{
963
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
964
25
  int i;
965
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
966
53
    if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
967
53
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
968
45
    else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
969
45
      _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
970
25
}
971
972
static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
973
25
{
974
25
  assert(CCV_IS_TENSOR_MULTIVIEW(mv));
975
25
  int i;
976
25
  if (mv->sp)
977
8
    for (i = 0; i < mv->sp->rnum; i++)
978
6
    {
979
6
      ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
980
6
      if (CCV_NNC_IS_METADATA_POS(*tensor))
981
6
      {
982
1
        const int pos = (int)(intptr_t)*tensor;
983
1
        *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
984
1
        assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
985
1
        _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
986
1
      }
987
6
    }
988
78
  for (i = 0; i < mv->kind + mv->repeat; i++)
989
53
  {
990
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
991
53
      CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
992
53
    if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
993
53
      CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
994
53
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
995
53
      _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
996
53
  }
997
25
}
998
999
static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1000
47
{
1001
47
  // Go to the root of the graph.
1002
47
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1003
47
  int i;
1004
104
  for (i = 1; prep->p; 
i++57
)
1005
57
    prep = prep->p;
1006
47
  // Root graph should have no dup tensor blocks.
1007
47
  assert(!prep->dup_tensor_block_ref);
1008
47
  const int c = i;
1009
47
  const ccv_nnc_symbolic_graph_prep_t* preps[c];
1010
47
  prep = graph_prep;
1011
47
  preps[c - 1] = prep;
1012
104
  for (i = 0; prep->p; 
i++57
)
1013
57
    preps[c - 2 - i] = prep = prep->p;
1014
47
  int ch[c]; // Use a variable-length array. This records our selections when recursing from top to bottom.
1015
47
  memset(ch, 0, sizeof(int) * c);
1016
47
  int pos = 0;
1017
47
  _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1018
47
  assert(ch[c - 1] == 0); // This shouldn't be modified.
1019
47
  assert(pos > 0);
1020
47
  return pos;
1021
47
}
1022
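
_ccv_nnc_tensor_multiview_gen starts by measuring how deeply the current prep is nested, then lays the chain out root-first in a variable-length array so the recursive search can descend top-down. A self-contained sketch of that two-pass walk; scope_t and scope_path_from_root are assumed stand-ins:

/* Hypothetical nested-scope node with a parent pointer. */
typedef struct scope_s {
	struct scope_s* parent;
} scope_t;

/* Record the chain from the root scope down to leaf into path[],
 * returning the depth, or -1 if path cannot hold the full chain. */
static int scope_path_from_root(scope_t* const leaf, scope_t** const path, const int max_depth)
{
	int depth = 1;
	scope_t* s = leaf;
	for (; s->parent; s = s->parent)
		++depth;
	if (depth > max_depth)
		return -1;
	s = leaf;
	int i;
	for (i = depth - 1; i >= 0; i--, s = s->parent)
		path[i] = s; /* path[0] ends up as the root, path[depth - 1] as the leaf. */
	return depth;
}
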
1023
static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1024
3
{
1025
3
  const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1026
3
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1027
3
  ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : 
tensor0
;
1028
3
  ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1029
3
    CCV_NNC_TENSOR_PLACEHOLDER,
1030
3
    tv,
1031
3
  }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1032
3
  CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
1033
3
  CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
1034
3
  return mv_pos;
1035
3
}
1036
1037
static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1038
30
{
1039
30
  ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1040
30
  const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr);
1041
30
  if (!is_multiview)
1042
18
    return pos;
1043
24
  
while (12
CCV_IS_TENSOR_MULTIVIEW(tensor_ptr))
1044
12
  {
1045
12
    const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1046
12
    tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1047
12
  }
1048
12
  const ccv_nnc_tensor_t tensor = *tensor_ptr;
1049
12
  const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1050
12
  ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1051
12
  *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1052
12
  ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1053
12
  new_tensor->alias_ref = (uintptr_t)pos;
1054
12
  ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1055
12
  return new_pos;
1056
12
}
1057
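
The flattening helper above follows slot 0 of a nested multi-view down to a plain tensor, then emits a flat header that borrows that leaf's data pointer and stays subscribed to the multi-view for future repointing. The descend-to-leaf step, sketched with an assumed view_t stand-in type:

/* Hypothetical tagged node: either a leaf carrying a data pointer or a
 * wrapper whose slot 0 points at the currently selected view. */
typedef struct view_s {
	int is_wrapper;
	void* data;              /* Valid when is_wrapper == 0. */
	struct view_s* slot0;    /* Valid when is_wrapper != 0. */
} view_t;

/* Follow slot 0 until we hit a leaf; the caller can then build a flat
 * header around leaf->data and subscribe it for later repointing. */
static view_t* view_leaf(view_t* v)
{
	while (v && v->is_wrapper)
		v = v->slot0;
	return v;
}
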
1058
static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1059
1.17k
{
1060
1.17k
  // All tensors are assigned out; now, num_assigned is the number of discontinuous buffers,
1061
1.17k
  // and each tensor has its designation in the assigned array and its offset in allocated_offset.
1062
1.17k
  const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1063
1.17k
  ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1064
1.17k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1065
1.17k
  const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1066
1.17k
  const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1067
1.17k
  const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? 
p_graph_prep->alloc_prep49
:
01.12k
;
1068
1.17k
  const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1069
1.17k
  const int unroll_count = graph_prep->unroll_count;
1070
1.17k
  int i, j;
1071
21.3k
  for (i = 0; i < tensor_symbol_info_size; 
i++20.2k
)
1072
20.2k
    
for (j = 0; 20.2k
TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) &&
j < unroll_count12.0k
;
j++7
)
1073
7
    {
1074
7
      const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1075
7
      if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref]))
1076
7
        
TENSOR_EXPECT_UNSET_UNASSIGNED3
(tensor_blocks[i]);
1077
7
    }
1078
1.17k
  ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size);
1079
1.17k
  graph_prep->tensor_arena = tensor_arena;
1080
1.17k
  tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1081
1.17k
  tensor_arena->buffers = (void*)(tensor_arena + 1);
1082
1.17k
  tensor_arena->buffer_size = alloc_prep->buffer_size;
1083
1.17k
  tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1084
1.17k
  tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1085
1.17k
  tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1086
1.17k
  tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1087
1.17k
  tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1088
1.17k
  tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1089
5.43k
  for (i = 0; i < alloc_prep->buffer_size; 
i++4.26k
)
1090
4.26k
    tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1091
4.26k
      tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1092
4.26k
      tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1093
1.17k
  if (graph_prep->while_count_tensor)
1094
19
  {
1095
19
    // If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1096
19
    int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1097
19
    assert((0 << 1) + 1 == pos); // pos must be 0 position.
1098
19
    ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1099
19
    *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1100
19
  }
1101
1.17k
  assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep));
1102
1.17k
  if (p_arena && 
p_graph_prep49
)
1103
49
  {
1104
49
    // Don't need to allocate the actual buffer, just use the pointer from the above.
1105
49
    PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena);
1106
230
    for (i = 0; i < tensor_arena->buffer_size; 
i++181
)
1107
181
    {
1108
181
      const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1109
181
      int unref_p_ref = p_ref;
1110
183
      while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1111
2
        unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1112
181
      assert(unref_p_ref >= 0);
1113
181
      const int p_unroll_count = p_graph_prep->unroll_count;
1114
181
      if (p_graph_prep->dup_tensor_block_ref &&
1115
181
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 016
&&
1116
181
        
p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref16
)
1117
10
      {
1118
10
        // This condition means in the parent graph, we point to multiple tensor blocks for the same
1119
10
        // buffer, therefore, we cannot have one single pointer assigned in this case.
1120
10
        // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
1121
10
        tensor_arena->buffers[i].ptr = 0;
1122
10
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1123
10
        continue;
1124
10
      }
1125
171
      // Otherwise, find the actual buffer pointer.
1126
171
      const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1127
171
      assert(vt_ref >= 0);
1128
171
      const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1129
171
      if (!p_arena->buffers[buffer_ref].ptr)
1130
0
      {
1131
0
        // Pass it down as 0 ptr.
1132
0
        tensor_arena->buffers[i].ptr = 0;
1133
0
        PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i);
1134
0
        continue;
1135
0
      }
1136
171
      const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1137
171
      tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1138
171
      PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset);
1139
171
    }
1140
1.12k
  } else {
1141
1.12k
    // Now, allocate actual buffers.
1142
1.12k
    PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena);
1143
5.20k
    for (i = 0; i < tensor_arena->buffer_size; 
i++4.08k
)
1144
4.08k
    {
1145
4.08k
      const int buffer_type = tensor_arena->buffers[i].type;
1146
4.08k
      const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
1147
4.08k
#ifdef HAVE_CUDA
1148
4.08k
      if (memory_type == CCV_TENSOR_GPU_MEMORY)
1149
544
      {
1150
544
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
1151
544
        tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1152
544
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1153
3.53k
      } else {
1154
3.53k
        assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1155
3.53k
        if (tensor_arena->buffers[i].pin_mem)
1156
10
          tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1157
3.53k
        else
1158
3.53k
          
ccmemalign3.52k
((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size)3.52k
;
1159
3.53k
        PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1160
3.53k
      }
1161
#else
1162
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
1163
      ccmemalign((void**)&tensor_arena->buffers[i].ptr, 16, tensor_arena->buffers[i].size);
1164
      PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size);
1165
#endif
1166
4.08k
      assert(tensor_arena->buffers[i].ptr);
1167
4.08k
    }
1168
1.12k
  }
1169
1.17k
  // Go over sub_preps and allocate arenas for them. Do it this early because
1170
1.17k
  // we may reference tensors from sub arenas. The reason we need to reference
1171
1.17k
  // tensors from sub arenas is that, for output tensors, the sub arena's tensor
1172
1.17k
  // will have automatic reference updates.
1173
1.22k
  
for (i = 0; 1.17k
i < tensor_arena->sub_arena_size;
i++50
)
1174
50
    if (graph_prep->sub_preps[i])
1175
49
      tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], tensor_arena, tensor_binds, tensor_bind_size);
1176
1
    else
1177
1
      tensor_arena->sub_arenas[i] = 0;
1178
1.17k
  memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1179
1.17k
  // Now that sub-arenas are all assigned, go over their outputs to assign out tensors directly.
1180
1.17k
  ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? 
(ccv_nnc_tensor_t**)29
cccalloc29
(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) :
01.14k
;
1181
1.22k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1182
50
    if (tensor_arena->sub_arenas[i])
1183
49
    {
1184
49
      assert(graph_prep->sub_preps[i]);
1185
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1186
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1187
49
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1188
45
        
for (j = 0; 21
j < node->output_size;
j++24
)
1189
24
        {
1190
24
          const int idx = node->outputs[j];
1191
24
          const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i) - 1;
1192
24
          assert(s_idx >= 0);
1193
24
          ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1194
24
          assert(sub_arena_out_tensors[idx] == 0);
1195
24
          ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1196
24
          // Only assign if it is a multiview tensor.
1197
24
          if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor) ||
1198
24
            
(8
sub_alias8
&&
CCV_IS_TENSOR_MULTIVIEW1
(sub_alias)))
1199
17
            sub_arena_out_tensors[idx] = sub_tensor;
1200
24
        }
1201
49
    }
1202
1.17k
  // Assigning out the tensors (in case of sharing tensors / in-place ops).
1203
21.3k
  
for (i = 0; 1.17k
i < tensor_symbol_info_size;
i++20.2k
)
1204
20.2k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
1205
20.2k
    {
1206
8.06k
      const int vt_ref = alloc_prep->vt_blocks[i];
1207
8.06k
      const int buffer_ref = vt_ref >= 0 ? 
alloc_prep->blocks[vt_ref].buffer_ref8.06k
:
-13
;
1208
8.06k
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1209
8.06k
      // previous layer, therefore, cannot really find the buffer ptr.
1210
8.06k
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[i]103
) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1211
8.06k
        
(8.04k
(8.04k
graph_prep->dup_tensor_block_ref8.04k
&&
1212
8.04k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] >= 059
&&
1213
8.04k
          
graph_prep->dup_tensor_block_ref[i * unroll_count] != i57
) ||
1214
8.04k
         
(8.00k
buffer_ref >= 08.00k
&&
!tensor_arena->buffers[buffer_ref].ptr8.00k
)))
1215
47
      {
1216
47
        assert(graph_prep->p); // This must be in a sub-graph.
1217
47
        // If this is an input tensor and it needs to be preserved, wait until we go through inputs to preserve it.
1218
47
        if (graph_prep->tensor_blocks[i].p_refs[0] && 
_ccv_nnc_tensor_block_check_preserve(graph_prep, i)36
)
1219
4
          continue;
1220
43
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1221
43
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1222
43
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1223
8.01k
      } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])) {
1224
8.01k
        // When we want to allocate, we don't really need to if it needs force broadcast, because we will handle that later.
1225
8.01k
        const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1226
8.01k
        // If already created, use the same tensor, and continue.
1227
8.01k
        // Having ptr.
1228
8.01k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1229
8.01k
        ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1230
8.01k
        // Also, set its allocations.
1231
8.01k
        // Since tensor view is bit compatible with tensor, we can just cast.
1232
8.01k
        *tensor = ccv_nnc_tensor(tensor_arena->buffers[buffer_ref].ptr + offset, tensor_symbol_info[i].info, 0);
1233
8.01k
        assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size);
1234
8.01k
        // If we need to force broadcast, we need to wrap it in a multiview.
1235
8.01k
        if (graph_prep->tensor_blocks[i].p_refs[0] &&
1236
8.01k
          
_ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i)58
)
1237
1
        {
1238
1
          const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1239
1
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1240
1
          ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1241
1
          ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1242
1
            tv,
1243
1
          }, 0, 1, graph_prep->graph, mv);
1244
1
          CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1245
1
          pos = mv_pos;
1246
1
          ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1247
1
        }
1248
8.01k
        tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1249
8.01k
      }
1250
8.06k
    }
1251
1.17k
  // Handle binded tensors. We handle them here so aliases can reference binded tensors.
1252
11.0k
  
for (i = 0; 1.17k
i < tensor_bind_size;
i++9.89k
)
1253
9.89k
  {
1254
9.89k
    assert(tensor_binds[i].tensor);
1255
9.89k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1256
9.89k
    if (resolved_symbol.d >= 0)
1257
9.89k
    {
1258
9.89k
      int d = resolved_symbol.d;
1259
10.0k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && tensor_blocks[d].ref)
1260
130
        d = tensor_blocks[d].ref - 1;
1261
9.89k
      // For binded tensors, it shouldn't be assigned yet.
1262
9.89k
      // If it is assigned, the pointer should match the ones from the binded tensor.
1263
9.89k
      // This can only happen if an enforced in-place tensor is binded twice. If that
1264
9.89k
      // happens, we need to make sure it is binded to the same location.
1265
9.89k
      assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8);
1266
9.89k
      if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor))
1267
9.89k
      {
1268
6
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1269
6
        ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1270
6
        memcpy(tv, tensor_binds[i].tensor, sizeof(ccv_nnc_tensor_view_t));
1271
6
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1272
9.88k
      } else {
1273
9.88k
        int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1274
9.88k
        ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1275
9.88k
        *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.ptr, tensor_binds[i].tensor->info, 0);
1276
9.88k
        tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1277
9.88k
      }
1278
9.89k
    }
1279
9.89k
  }
1280
1.17k
  // Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1281
21.3k
  
for (i = 0; 1.17k
i < tensor_symbol_info_size;
i++20.2k
)
1282
20.2k
    // It could be a binded tensor (or unused); in that case, it doesn't have a ref.
1283
20.2k
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_blocks[i].ref12.0k
&&
!tensor_arena->vt_tensors[i]2.00k
)
1284
2.00k
    {
1285
2.00k
      int ref = tensor_blocks[i].ref - 1;
1286
2.00k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && 
tensor_blocks[ref].ref133
)
1287
1
        ref = tensor_blocks[ref].ref - 1;
1288
2.00k
      assert(tensor_arena->vt_tensors[ref]);
1289
2.00k
      tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1290
2.00k
    }
1291
1.17k
  // Now after refs assigned out, handle the case I need to preserve because I am a sub graph of while loop.
1292
1.17k
  if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1293
21
  {
1294
21
    assert(graph_prep->p);
1295
21
    const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1296
21
    const int p_idx = graph_prep->p_idx - 1;
1297
46
    for (i = 0; i < node->input_size; 
i++25
)
1298
25
    {
1299
25
      const int idx = node->inputs[i];
1300
25
      int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx) - 1;
1301
25
      assert(!tensor_blocks[block_ref].ref);
1302
25
      const int vt_ref = alloc_prep->vt_blocks[block_ref];
1303
25
      if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1304
18
        continue;
1305
7
      assert(vt_ref >= 0);
1306
7
      const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1307
7
      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]));
1308
7
      assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]));
1309
7
      // Either we have dup_tensor_block_ref in current layer, or we have that in
1310
7
      // previous layer, therefore, cannot really find the buffer ptr.
1311
7
      if ((!sub_arena_out_tensors || 
!sub_arena_out_tensors[block_ref]0
) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1312
7
        ((graph_prep->dup_tensor_block_ref &&
1313
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 04
&&
1314
7
          
graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref4
) ||
1315
7
         
!tensor_arena->buffers[buffer_ref].ptr3
))
1316
4
      {
1317
4
        // We haven't allocated anything for this yet.
1318
4
        assert(tensor_arena->vt_tensors[block_ref] == 0);
1319
4
        const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1320
4
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1321
4
        ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1322
4
      } else {
1323
3
        const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1324
3
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1325
3
        ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1326
3
      }
1327
7
    }
1328
21
  }
1329
1.17k
  // For case..of statement, the output is a phi variable, thus, if we take the skip branch, we will select the original input.
1330
1.17k
  // This creates the multi-view tensor to achieve that.
1331
21.3k
  
for (i = 0; 1.17k
i < tensor_symbol_info_size;
i++20.2k
)
1332
20.2k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1333
10
    {
1334
10
      const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1335
10
      // Create phi multi-view.
1336
10
      const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1337
10
      const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1338
10
      const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1339
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1340
10
      ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1341
10
      ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1342
10
      ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1343
10
        intv,
1344
10
        outv,
1345
10
      }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI, mv);
1346
10
      CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1347
10
      CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1348
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1349
10
      ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1350
10
    }
1351
1.17k
  // Now it is time to handle alias.
1352
9.48k
  for (i = 0; i < alloc_prep->block_size; 
i++8.31k
)
1353
8.31k
    if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1354
8.17k
    {
1355
8.17k
      const int block_ref = alloc_prep->blocks[i].block_ref;
1356
8.17k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1357
8.17k
      {
1358
109
        // Assigning out the tensor aliases.
1359
109
        assert(tensor_symbol_info[block_ref].alias_ref);
1360
109
        const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1361
109
        // What it references is not an alias.
1362
109
        assert(tensor_arena->vt_tensors[alias_ref]);
1363
109
        // If this is not alias (it is binded then).
1364
109
        if (!CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[alias_ref]))
1365
109
        {
1366
0
          int pos;
1367
0
          if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1368
0
            memcmp(tensor_symbol_info[block_ref].inc, tensor_symbol_info[block_ref].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
1369
0
          {
1370
0
            pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1371
0
            ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1372
0
            *tensor = ccv_nnc_tensor(tensor_arena->vt_tensors[alias_ref]->data.u8, tensor_symbol_info[block_ref].info, 0);
1373
0
          } else {
1374
0
            pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1375
0
            ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1376
0
            // Otherwise initialize a tensor view
1377
0
            *tensor_view = ccv_nnc_tensor_view(tensor_arena->vt_tensors[alias_ref], tensor_symbol_info[block_ref].info.dim, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].inc);
1378
0
            tensor_view->alias_ref = (uintptr_t)tensor_arena->vt_tensors[alias_ref];
1379
0
          }
1380
0
          tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1381
0
          continue;
1382
0
        }
1383
109
        const int alias_pos = (int)(intptr_t)tensor_arena->vt_tensors[alias_ref];
1384
109
        const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1385
109
        assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1386
109
        // Will use that to determine whether to insert a reference or not.
1387
109
        const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1388
122
        while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1389
109
        {
1390
13
          const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1391
13
          alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1392
13
        }
1393
109
        const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1394
109
        // If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1395
109
        int pos;
1396
109
        if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1397
109
          
memcmp(tensor_symbol_info[block_ref].inc, tensor_symbol_info[block_ref].info.dim, sizeof(int) * 87
CCV_NNC_MAX_DIM_ALLOC87
) == 0)
1398
54
        {
1399
54
          pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1400
54
          ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1401
54
          *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1402
55
        } else {
1403
55
          pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1404
55
          ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1405
55
          // Otherwise initialize a tensor view
1406
55
          *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info.dim, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].inc);
1407
55
          tensor_view->alias_ref = (uintptr_t)alias_pos;
1408
55
        }
1409
109
        tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1410
109
        if (is_multiview)
1411
13
        {
1412
13
          ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, alias_pos);
1413
13
          ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1414
13
        }
1415
109
      }
1416
8.17k
    }
1417
1.17k
  // Replacing the tensor placeholder within the sub arena's multi-view with the input tensor.
1418
1.22k
  
for (i = 0; 1.17k
i < tensor_arena->sub_arena_size;
i++50
)
1419
50
    if (tensor_arena->sub_arenas[i])
1420
49
    {
1421
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1422
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1423
138
      for (j = 0; j < node->input_size; 
j++89
)
1424
89
      {
1425
89
        const int idx = node->inputs[j];
1426
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && 
tensor_symbol_info[idx].s_ref->rnum > i87
) ?
*(int*)78
ccv_array_get78
(tensor_symbol_info[idx].s_ref, i) - 1 :
-111
;
1427
89
        if (s_idx < 0)
1428
23
          continue;
1429
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1430
66
        // Only do the replacement if it is a multi-view tensor.
1431
66
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its peer.
1432
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor) &&
!18
TENSOR_EXPECT_UNASSIGNED18
(tensor_blocks[idx]))
1433
66
        {
1434
18
          // It cannot be binded tensor.
1435
18
          assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]));
1436
18
          const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1437
18
          const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1438
18
          ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? 
sub_arena_out_tensors[idx]1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)17
;
1439
18
          // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1440
18
          // to this tensor.
1441
18
          if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor))
1442
18
          {
1443
6
            const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1444
6
            ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1445
6
            ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? 
vt_tensor1
:
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos)5
);
1446
6
            ref_tensor->alias_ref = is_sub_arena_out_tensor ? 
(uintptr_t)vt_tensor1
:
(uintptr_t)vt_pos5
;
1447
6
            ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1448
6
            ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0]) ? 
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)5
CCV_NNC_MULTIVIEW_DATA5
(multiview)[0]) :
CCV_NNC_MULTIVIEW_DATA1
(multiview)[0]1
);
1449
6
            while (CCV_IS_TENSOR_MULTIVIEW(tv))
1450
6
              
tv = (ccv_nnc_tensor_t*)(0
CCV_NNC_IS_METADATA_POS0
(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0]) ?
_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)0
CCV_NNC_MULTIVIEW_DATA0
((ccv_nnc_tensor_multiview_t*)tv)[0]) :
CCV_NNC_MULTIVIEW_DATA0
((ccv_nnc_tensor_multiview_t*)tv)[0]0
);
1451
6
            *ref_tensor = ccv_nnc_tensor(tv->data.ptr, tv->info, 0);
1452
6
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1453
6
          } else
1454
12
            _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? 
vt_tensor0
: (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1455
18
        }
1456
66
      }
1457
49
    }
1458
1.17k
  // After aliases are created, for the case..of statement, we now revert back to a flat tensor rather than a multi-view.
1459
1.17k
  // No worries though, this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1460
1.17k
  // when initializing the case..of node, which will take the phi multi-view again.
1461
21.3k
  
for (i = 0; 1.17k
i < tensor_symbol_info_size;
i++20.2k
)
1462
20.2k
    if (tensor_blocks[i].bypass_ref && 
tensor_arena->vt_tensors[i]10
)
1463
10
    {
1464
10
      assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]));
1465
10
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1466
10
      assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI);
1467
10
      tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1468
10
    }
1469
1.17k
  // Rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1470
21.3k
  
for (i = 0; 1.17k
i < tensor_symbol_info_size;
i++20.2k
)
1471
20.2k
    if (tensor_arena->vt_tensors[i])
1472
20.0k
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1473
1.17k
  // Associate multiview tensors from sub arena to the parent.
1474
1.17k
  if (sub_arena_out_tensors)
1475
29
  {
1476
243
    for (i = 0; i < alloc_prep->block_size; 
i++214
)
1477
214
      if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1478
113
      {
1479
113
        const int block_ref = alloc_prep->blocks[i].block_ref;
1480
113
        if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))
1481
113
          
continue0
;
1482
113
        int sub_arena_ref = block_ref;
1483
113
        if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))
1484
113
        {
1485
10
          // Assigning out the tensor aliases.
1486
10
          assert(tensor_symbol_info[block_ref].alias_ref);
1487
10
          const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1488
10
          // What it references is not an alias.
1489
10
          assert(tensor_arena->vt_tensors[alias_ref]);
1490
10
          sub_arena_ref = alias_ref;
1491
10
          if (!sub_arena_out_tensors[sub_arena_ref])
1492
3
            continue;
1493
110
        }
1494
110
        if (!sub_arena_out_tensors[sub_arena_ref])
1495
86
          continue;
1496
24
        ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref]) ? 
sub_arena_out_tensors[sub_arena_ref]23
:
(ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref1
);
1497
24
        assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1498
24
        // This is only possible if the vt_tensor is a phi node.
1499
24
        if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1500
0
        {
1501
0
          // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1502
0
          ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1503
0
          assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI);
1504
0
          assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]));
1505
0
          CCV_NNC_MULTIVIEW_DATA(phi)[1]->alias_ref = (uintptr_t)mv;
1506
0
          ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)[1]);
1507
24
        } else {
1508
24
          tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1509
24
          ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1510
24
        }
1511
24
      }
1512
29
  }
1513
1.17k
  // Go over all the tensors that have assign_ref. If the tensor it is assigned from is:
1514
1.17k
  // 1). From sub_arena_out_tensors, it could now be pointing to an area this arena doesn't know about.
1515
1.17k
  // 2). From a phi multi-view; in this case, this arena won't know beforehand which memory will be used.
1516
1.17k
  // Therefore, for the above two scenarios, the tensor with assign_ref, even if it is a multiview tensor, needs to subscribe
1517
1.17k
  // to the output of the assign_ref tensor.
1518
21.3k
  
for (i = 0; 1.17k
i < tensor_symbol_info_size;
i++20.2k
)
1519
20.2k
    if (tensor_arena->vt_tensors[i] && 
tensor_symbol_info[i].assign_ref20.0k
)
1520
25
    {
1521
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1522
25
      ccv_nnc_tensor_t* assign_tensor;
1523
25
      if (sub_arena_out_tensors && 
sub_arena_out_tensors[assign_ref]3
)
1524
0
        assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref]) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1525
25
      else
1526
25
        assign_tensor = tensor_arena->vt_tensors[assign_ref];
1527
25
      ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1528
25
    }
1529
1.17k
  if (sub_arena_out_tensors)
1530
1.17k
    
ccfree29
(sub_arena_out_tensors)29
;
1531
1.17k
  // Rewire sub arena's tensor references.
1532
1.22k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
1533
50
    if (tensor_arena->sub_arenas[i])
1534
49
    {
1535
49
      const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1536
49
      const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1537
138
      for (j = 0; j < node->input_size; 
j++89
)
1538
89
      {
1539
89
        const int idx = node->inputs[j];
1540
89
        const int s_idx = (tensor_symbol_info[idx].s_ref && 
tensor_symbol_info[idx].s_ref->rnum > i87
) ?
*(int*)78
ccv_array_get78
(tensor_symbol_info[idx].s_ref, i) - 1 :
-111
;
1541
89
        if (s_idx < 0)
1542
23
          continue;
1543
66
        ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1544
66
        // Only do the replacement if it is a multi-view tensor.
1545
66
        // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its peer.
1546
66
        if (sub_tensor && 
CCV_IS_TENSOR_MULTIVIEW63
(sub_tensor))
1547
66
        {
1548
18
          // This is binded tensor, bind it now.
1549
18
          if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx]))
1550
18
            
_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx])0
;
1551
18
          else
1552
18
            _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1553
18
        }
1554
66
      }
1555
49
    }
1556
1.17k
  return tensor_arena;
1557
1.17k
}
1558
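
Taken as a whole, _ccv_nnc_tensor_arena_new materializes the allocation plan: one backing buffer per discontinuous region (aligned host memory, pinned host memory, or a per-device GPU allocation), with each tensor becoming a header that points at buffer ptr + offset. A minimal sketch of that carve-from-one-buffer idea, independent of the library's types; arena_buffer_t, carved_tensor_t and arena_carve are illustrative names, and posix_memalign stands in for ccmemalign:

#include <stdlib.h>
#include <stdint.h>
#include <string.h>

typedef struct {
	uint8_t* ptr;     /* One contiguous backing buffer. */
	uint64_t size;
} arena_buffer_t;

typedef struct {
	uint8_t* data;    /* Points into the arena buffer, never freed on its own. */
	uint64_t size;
} carved_tensor_t;

/* Allocate one 16-byte aligned buffer and hand out tensors at the offsets a
 * prior planning pass computed; mirrors ptr = buffers[buffer_ref].ptr + offset. */
static int arena_carve(arena_buffer_t* const buf, const uint64_t total_size, const uint64_t* const offsets, const uint64_t* const sizes, const int count, carved_tensor_t* const out)
{
	if (posix_memalign((void**)&buf->ptr, 16, total_size) != 0)
		return -1;
	buf->size = total_size;
	memset(buf->ptr, 0, total_size);
	int i;
	for (i = 0; i < count; i++)
	{
		if (offsets[i] + sizes[i] > total_size)
			return -1; /* Planning error: block overflows its buffer. */
		out[i].data = buf->ptr + offsets[i];
		out[i].size = sizes[i];
	}
	return 0;
}
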
1559
static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_peer_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int peer_ref)
1560
17
{
1561
17
  assert(graph);
1562
17
  if ((intptr_t)graph == tensor_arena->graph_ref)
1563
7
  {
1564
7
    assert(peer_ref >= 0 && peer_ref < tensor_arena->vt_tensor_size);
1565
7
    return tensor_arena->vt_tensors[peer_ref];
1566
10
  }
1567
10
  int i;
1568
13
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++3
)
1569
10
    if (tensor_arena->sub_arenas[i])
1570
10
    {
1571
10
      ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_peer_ref(tensor_arena->sub_arenas[i], graph, peer_ref);
1572
10
      if (tensor)
1573
7
        return tensor;
1574
10
    }
1575
10
  
return 03
;
1576
10
}
1577
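
The peer-ref lookup above is a depth-first search over the arena tree: answer from this arena if the graph matches, otherwise recurse into each sub-arena and return the first hit. The same shape as a generic, self-contained sketch (node_arena_t and arena_find are assumed names):

#include <stddef.h>

/* Hypothetical arena tree node keyed by an opaque graph identity. */
typedef struct node_arena_s {
	const void* key;
	void* payload;
	int child_count;
	struct node_arena_s** children;
} node_arena_t;

/* Depth-first search: return the payload of the first arena whose key
 * matches, or NULL when no arena in the subtree owns that graph. */
static void* arena_find(const node_arena_t* const arena, const void* const key)
{
	if (arena->key == key)
		return arena->payload;
	int i;
	for (i = 0; i < arena->child_count; i++)
		if (arena->children[i])
		{
			void* const found = arena_find(arena->children[i], key);
			if (found)
				return found;
		}
	return NULL;
}
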
1578
static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1579
7
{
1580
7
  if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
1581
7
    
tensor->type |= CCV_TAPE_ALLOC5
;
1582
2
  else {
1583
2
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1584
2
    mv->type |= CCV_TAPE_ALLOC;
1585
2
    int i;
1586
5
    for (i = 0; i < mv->repeat + mv->kind; 
i++3
)
1587
3
      _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1588
2
  }
1589
7
}
1590
1591
static void _ccv_nnc_tensor_arena_fixup_peer_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1592
1.17k
{
1593
1.17k
  assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
1594
1.17k
  int i;
1595
21.3k
  for (i = 0; i < graph_prep->tensor_symbol_info_size; 
i++20.2k
)
1596
20.2k
  {
1597
20.2k
    if (graph_prep->tensor_symbol_info[i].peer_ref)
1598
7
    {
1599
7
      tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_peer_ref(root_arena, graph_prep->symbolic_graph->peer, graph_prep->tensor_symbol_info[i].peer_ref - 1);
1600
7
      // No need to continue checking this if it is from its peer.
1601
7
      continue;
1602
7
    }
1603
20.1k
    if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && 
tensor_arena->vt_tensors[i]7
)
1604
7
    {
1605
7
      // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1606
7
      if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i]))
1607
7
      {
1608
5
        const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1609
5
        if (vt_ref >= 0 &&
1610
5
          TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref]) == READ_ONLY)
1611
3
          continue;
1612
4
      }
1613
4
      _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1614
4
    }
1615
20.1k
  }
1616
1.22k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
1617
50
    if (graph_prep->sub_preps[i])
1618
49
      _ccv_nnc_tensor_arena_fixup_peer_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1619
1.17k
}
1620
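
This fixup pass walks the prep tree and the arena tree in lockstep: the two were built with the same shape, so child i of one corresponds to child i of the other. A generic sketch of that paired recursion with illustrative types (pair_node_t, pair_walk):

/* Two trees built with identical shape; visit corresponding nodes together. */
typedef struct pair_node_s {
	int child_count;
	struct pair_node_s** children;
	void* data;
} pair_node_t;

typedef void (*pair_visit_f)(pair_node_t* a, pair_node_t* b, void* context);

/* Recurse over both trees at once; children line up index-for-index, and a
 * missing child on one side is skipped on the other as well. */
static void pair_walk(pair_node_t* const a, pair_node_t* const b, const pair_visit_f visit, void* const context)
{
	visit(a, b, context);
	int i;
	for (i = 0; i < a->child_count && i < b->child_count; i++)
		if (a->children[i] && b->children[i])
			pair_walk(a->children[i], b->children[i], visit, context);
}
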
1621
static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1622
35.5k
{
1623
35.5k
  int i, found = 0;
1624
35.5k
  // Try to insert head.
1625
35.5k
  ccv_array_t* head = tensor_blocks.head;
1626
35.5k
  assert(head);
1627
37.0k
  
for (i = 0; 35.5k
i < head->rnum;)
1628
18.2k
  {
1629
18.2k
    const int head_idx = *(int*)ccv_array_get(head, i);
1630
18.2k
    if (head_idx == idx)
1631
109
    {
1632
109
      found = 1;
1633
109
      break;
1634
109
    }
1635
18.0k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1636
18.0k
    if (cell.i32 && 
cell.i32[0] > 041
)
1637
41
    {
1638
41
      /* If the current node is the parent of the head node, check if we found it or not. */
1639
41
      /* If not found, replace the current one. */
1640
41
      if (!found)
1641
41
      {
1642
41
        found = 1;
1643
41
        *(int*)ccv_array_get(head, i) = idx;
1644
41
      } else {
1645
0
        /* Remove the current one, change the rnum. */
1646
0
        if (i < head->rnum - 1)
1647
0
          *(int*)ccv_array_get(head, i) = *(int*)ccv_array_get(head, head->rnum - 1);
1648
0
        --head->rnum;
1649
0
        continue;
1650
0
      }
1651
18.0k
    } else {
1652
18.0k
      // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1653
18.0k
      cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1654
18.0k
      if (cell.i32 && 
cell.i32[0] > 016.6k
)
1655
16.6k
      {
1656
16.6k
        found = 1;
1657
16.6k
        break;
1658
16.6k
      }
1659
1.43k
    }
1660
1.43k
    /* Advancing i. */
1661
1.43k
    ++i;
1662
1.43k
  }
1663
35.5k
  /* If not found, push this idx to the end of the array. */
1664
35.5k
  if (!found)
1665
18.7k
    ccv_array_push(head, &idx);
1666
35.5k
  // Try to insert tail.
1667
35.5k
  found = 0;
1668
35.5k
  ccv_array_t* tail = tensor_blocks.tail;
1669
35.5k
  assert(tail);
1670
53.2k
  
for (i = 0; 35.5k
i < tail->rnum;)
1671
18.9k
  {
1672
18.9k
    const int tail_idx = *(int*)ccv_array_get(tail, i);
1673
18.9k
    if (tail_idx == idx)
1674
1.17k
    {
1675
1.17k
      found = 1;
1676
1.17k
      break;
1677
1.17k
    }
1678
17.7k
    ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1679
17.7k
    if (cell.i32 && 
cell.i32[0] > 016.2k
)
1680
16.2k
    {
1681
16.2k
      /* If the current node is the child of the tail node, check if we found it or not. */
1682
16.2k
      /* If not found, replace the current one. */
1683
16.2k
      if (!found)
1684
15.4k
      {
1685
15.4k
        found = 1;
1686
15.4k
        *(int*)ccv_array_get(tail, i) = idx;
1687
15.4k
      } else {
1688
710
        /* Remove the current one, change the rnum. */
1689
710
        *(int*)ccv_array_get(tail, i) = *(int*)ccv_array_get(tail, tail->rnum - 1);
1690
710
        --tail->rnum;
1691
710
        continue;
1692
710
      }
1693
1.53k
    } else {
1694
1.53k
      // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1695
1.53k
      cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1696
1.53k
      if (cell.i32 && 
cell.i32[0] > 0110
)
1697
110
      {
1698
110
        found = 1;
1699
110
        break;
1700
110
      }
1701
16.9k
    }
1702
16.9k
    /* Advancing i. */
1703
16.9k
    ++i;
1704
16.9k
  }
1705
35.5k
  /* If not found, push this idx to the end of the array. */
1706
35.5k
  if (!found)
1707
18.8k
    ccv_array_push(tail, &idx);
1708
35.5k
}
1709
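
_ccv_nnc_tensor_block_add_exec keeps each block's head and tail lists minimal: a new exec replaces any entry it precedes, is dropped if an existing entry already covers it, and is appended only when it is unordered with respect to everything present. A self-contained sketch of the head-side update, with a caller-supplied reachability callback standing in for the exec_dep sparse matrix (head_set_add and ancestor_f are assumed names):

/* Returns non-zero if execution node `from` runs before (is a transitive
 * ancestor of) node `to`; in the real code this is an exec_dep lookup. */
typedef int (*ancestor_f)(int from, int to, void* context);

/* Maintain a minimal set of "earliest" nodes: replace a head that idx
 * precedes, drop idx if an existing head already precedes it, else append. */
static int head_set_add(int* const heads, int count, const int idx, const ancestor_f precedes, void* const context)
{
	int i, found = 0;
	for (i = 0; i < count;)
	{
		if (heads[i] == idx)
		{
			found = 1;
			break;
		}
		if (precedes(idx, heads[i], context))
		{
			/* idx runs earlier than this head. */
			if (!found)
			{
				found = 1;
				heads[i] = idx; /* Take its place. */
			} else {
				heads[i] = heads[count - 1]; /* Already inserted once; drop the redundant head. */
				--count;
				continue;
			}
		} else if (precedes(heads[i], idx, context)) {
			/* An existing head already runs earlier; idx adds nothing. */
			found = 1;
			break;
		}
		++i;
	}
	if (!found)
		heads[count++] = idx; /* Unordered w.r.t. every existing head. */
	return count; /* New element count; the array is assumed large enough. */
}
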
1710
ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1711
1.20k
{
1712
1.20k
  if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1713
1.10k
  {
1714
1.10k
    assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size);
1715
1.10k
    ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1716
1.10k
    if (tensor && 
CCV_IS_TENSOR_MULTIVIEW1.10k
(tensor))
1717
1.10k
    {
1718
11
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1719
22
      while (CCV_IS_TENSOR_MULTIVIEW(mv))
1720
11
        mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? 
mv->it1
:
CCV_NNC_MULTIVIEW_DATA10
(mv)[0]10
);
1721
11
      return (ccv_nnc_tensor_t*)mv;
1722
11
    }
1723
1.09k
    return tensor;
1724
1.09k
  }
1725
100
  int i;
1726
123
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++23
)
1727
99
    if (tensor_arena->sub_arenas[i])
1728
99
    {
1729
99
      ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1730
99
      if (tensor)
1731
76
        return tensor;
1732
99
    }
1733
100
  
return 024
;
1734
100
}
1735
1736
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1737
716k
{
1738
716k
  if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1739
716k
  {
1740
716k
    assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size);
1741
716k
    return graph_exec_arena->graph_execs[symbol.d];
1742
7
  }
1743
7
  int i;
1744
9
  for (i = 0; i < graph_exec_arena->sub_arena_size; 
i++2
)
1745
7
    if (graph_exec_arena->sub_arenas[i])
1746
7
    {
1747
7
      ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1748
7
      if (!CCV_NO_GRAPH_EXEC(exec))
1749
7
        
return exec5
;
1750
7
    }
1751
7
  
return (ccv_nnc_graph_exec_t){}2
; // 0.
1752
7
}
1753
1754
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1755
9
{
1756
9
  return graph_exec_arena->source;
1757
9
}
1758
1759
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1760
9
{
1761
9
  return graph_exec_arena->destination;
1762
9
}
1763
1764
// Check whether the head is the beginning of this block.
1765
static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1766
50
{
1767
50
  assert(tensor_block->head);
1768
50
  return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0) == head_node);
1769
50
}
1770
1771
// Check whether the tail is the end of this block.
1772
static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
1773
39
{
1774
39
  assert(tensor_block->tail);
1775
39
  return (tensor_block->tail->rnum == 1 && 
*(int*)36
ccv_array_get36
(tensor_block->tail, 0) == tail_node);
1776
39
}
1777
1778
// Make two tensor blocks one. Return 1 if that happened.
1779
static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
1780
2.17k
{
1781
2.17k
  // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
1782
2.17k
  if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0]) &&
1783
2.17k
    
(2.14k
!2.14k
TENSOR_IS_UNFOLDABLE_AS_OUTPUT2.14k
(tensor_blocks[p_ref_1]) ||
tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 118
) &&
1784
2.17k
    
tensor_blocks[p_ref_0].tail->rnum == 12.13k
&&
1785
2.17k
    
tensor_blocks[p_ref_1].head->rnum == 12.13k
&&
1786
2.17k
    
tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type2.13k
&& // Must be the same type.
1787
2.17k
    
*(int*)2.12k
ccv_array_get2.12k
(tensor_blocks[p_ref_0].tail, 0) == *(int*)
ccv_array_get2.12k
(tensor_blocks[p_ref_1].head, 0))
1788
2.17k
  {
1789
2.00k
    // If the two parent refs match (thus, they meet at the same node), we can concatenate them and mark one as a ref. This is very similar to in-place operation combining.
1790
2.00k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]));
1791
2.00k
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]));
1792
2.00k
    ccv_array_free(tensor_blocks[p_ref_0].tail);
1793
2.00k
    tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
1794
2.00k
    if (tensor_blocks[p_ref_1].p_refs[0])
1795
14
    {
1796
14
      assert(tensor_blocks[p_ref_1].p_refs[1] == 0); // It simply cannot have more than one p_ref, otherwise we cannot merge.
1797
14
      if (!tensor_blocks[p_ref_0].p_refs[0])
1798
10
        tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
1799
4
      else
1800
4
        tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
1801
14
    }
1802
2.00k
    tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
1803
2.00k
    TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]));
1804
2.00k
    ccv_array_free(tensor_blocks[p_ref_1].head);
1805
2.00k
    if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1]))
1806
2.00k
      
TENSOR_SET_UNFOLDABLE_AS_INPUT16
(tensor_blocks[p_ref_0]);
1807
2.00k
    // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
1808
2.00k
    TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1]);
1809
2.00k
    tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
1810
2.00k
    if (!tensor_blocks[p_ref_0].r_refs)
1811
1.96k
      tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
1812
2.00k
    ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
1813
2.00k
    tensor_blocks[p_ref_1].size = 0;
1814
2.00k
    tensor_blocks[p_ref_1].head = 0;
1815
2.00k
    tensor_blocks[p_ref_1].tail = 0;
1816
2.00k
    return 1;
1817
170
  }
1818
170
  return 0;
1819
170
}
1820
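
Folding is what lets an output reuse its input's memory: when block a's single tail exec is exactly block b's single head exec and both blocks live in the same memory type, a absorbs b's tail, b becomes an unassigned ref of a, and one allocation then covers both lifetimes. A compact sketch of that interval merge on a hypothetical interval_t record:

/* Hypothetical liveness record: a block is alive from exec `head` to exec
 * `tail` (single nodes here for simplicity), in one memory `type`. */
typedef struct {
	int head;
	int tail;
	int type;
	int ref;        /* 1-based index of the block this one folded into; 0 if none. */
	int assigned;   /* 0 once folded away. */
} interval_t;

/* Fold b into a when a ends exactly where b begins and both share a memory
 * type: a's lifetime extends to cover b, and b merely aliases a. */
static int interval_try_fold(interval_t* const blocks, const int a, const int b)
{
	if (blocks[a].tail != blocks[b].head || blocks[a].type != blocks[b].type)
		return 0;
	blocks[a].tail = blocks[b].tail;   /* a now lives until b's last use. */
	blocks[b].assigned = 0;            /* b no longer gets its own memory. */
	blocks[b].ref = a + 1;             /* b resolves to a at lookup time. */
	return 1;
}
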
1821
static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
1822
1.19k
{
1823
1.19k
  int i, j, k;
1824
1.19k
  // Generate exec dependencies (or, in other words, partial ordering of executions).
1825
1.19k
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
1826
1.19k
  int* buf = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
1827
1.19k
  int buf_size;
1828
1.19k
  if (p_node_info)
1829
62
    { assert(output_size == 0); }
1830
1.19k
#define for_block(x, val) \
1831
89.5k
  do { \
1832
89.5k
    if (((int32_t*)val)[0] > 0) \
1833
89.5k
    { \
1834
89.5k
      buf[buf_size * 2] = x; \
1835
89.5k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
1836
89.5k
      ++buf_size; \
1837
89.5k
    } \
1838
89.5k
  } while (0)
1839
7.70k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
1840
7.70k
    buf_size = 0; /* save all its parent deps to this buffer */
1841
7.70k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
1842
7.70k
    if (vector)
1843
89.5k
      
CCV_SPARSE_VECTOR_FOREACH6.40k
(exec_dep, vector, for_block);
1844
7.70k
    if (!node->outgoings)
1845
1.46k
      continue;
1846
13.5k
    
for (i = 0; 6.24k
i < node->outgoings->rnum;
i++7.31k
)
1847
7.31k
    {
1848
7.31k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
1849
7.31k
      const int32_t one = 1;
1850
7.31k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
1851
7.31k
      /* If not found, set it. If the current node is the destination node, no need to
1852
7.31k
       * set itself as a parent of subsequent nodes because of its terminal nature. */
1853
7.31k
      if (!term && 
(7.27k
!cell.i327.27k
||
cell.i32[0] == 00
))
1854
7.27k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
1855
98.1k
      for (j = 0; j < buf_size; 
j++90.7k
) /* set with all idx's dependencies as well */
1856
90.7k
      {
1857
90.7k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
1858
90.7k
        /* If not found, set */
1859
90.7k
        if (!cell.i32 || 
cell.i32[0] == 08.24k
)
1860
82.5k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
1861
8.24k
        else {
1862
8.24k
          /* Otherwise, set to the longest one */
1863
8.24k
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
1864
8.24k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
1865
8.24k
        }
1866
90.7k
      }
1867
7.31k
    }
1868
6.24k
  } ccv_nnc_graph_visit_endfor
1869
1.19k
#undef for_block
1870
1.19k
  ccfree(buf);
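/* Illustrative sketch (hypothetical toy_* helpers, not part of this file): the visit loop above
 * fills exec_dep so that cell (child, ancestor) holds the longest hop count from ancestor to
 * child. A dense equivalent over a small DAG whose nodes are already in topological order,
 * assuming a plain adjacency matrix, could look like this: */
#include <stdio.h>

#define TOY_N 4

static void toy_build_exec_dep(const int adj[TOY_N][TOY_N], int dep[TOY_N][TOY_N])
{
  int idx, i, j;
  for (idx = 0; idx < TOY_N; idx++) /* topological order assumed */
    for (i = 0; i < TOY_N; i++)
      if (adj[idx][i]) /* edge idx -> i */
      {
        if (dep[i][idx] < 1) /* direct dependency: one hop away */
          dep[i][idx] = 1;
        for (j = 0; j < TOY_N; j++) /* inherit idx's ancestors, one hop further */
          if (dep[idx][j] > 0 && dep[i][j] < dep[idx][j] + 1)
            dep[i][j] = dep[idx][j] + 1;
      }
}

int main(void)
{
  /* 0 -> 1 -> 3 and 0 -> 2 -> 3 */
  const int adj[TOY_N][TOY_N] = { {0, 1, 1, 0}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} };
  int dep[TOY_N][TOY_N] = {{0}};
  toy_build_exec_dep(adj, dep);
  printf("dep[3][0] = %d\n", dep[3][0]); /* prints 2: the longest path 0 -> 3 has two hops */
  return 0;
}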
1871
1.19k
  // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
1872
1.19k
  const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
1873
1.19k
  ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
1874
1.19k
  // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
1875
1.19k
  // happens that I have to loop through all relevant nodes to find out whether one is used or not.
1876
21.5k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++20.3k
)
1877
20.3k
    tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
1878
7.70k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
1879
29.9k
    for (i = 0; i < node->input_size; 
i++22.2k
)
1880
22.2k
      if (node->inputs[i] >= 0)
1881
16.9k
      {
1882
16.9k
        tensor_blocks[node->inputs[i]].flags = 0;
1883
16.9k
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1884
16.9k
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1885
16.9k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
1886
16.9k
          
(14.3k
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD14.3k
||
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD14.3k
))
1887
11
          tensor_blocks[node->inputs[i]].pin_mem = 1;
1888
16.9k
      }
1889
20.7k
    for (i = 0; i < node->output_size; 
i++12.9k
)
1890
12.9k
      if (node->outputs[i] >= 0)
1891
11.8k
      {
1892
11.8k
        tensor_blocks[node->outputs[i]].flags = 0;
1893
11.8k
        // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
1894
11.8k
        // This will get propagated back to the buffer, and used there to determine the allocation function to use.
1895
11.8k
        if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type) == CCV_TENSOR_CPU_MEMORY &&
1896
11.8k
          
(9.87k
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD9.87k
||
node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD9.87k
))
1897
9
          tensor_blocks[node->outputs[i]].pin_mem = 1;
1898
11.8k
      }
1899
7.70k
  } ccv_nnc_graph_visit_endfor
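/* Illustrative sketch (hypothetical, not the library's actual allocation path): pin_mem only
 * records that a CPU-memory tensor touches a data-transfer command; a later allocation step can
 * use that flag to pick page-locked memory so host/device copies can run asynchronously. Under
 * the same HAVE_CUDA guard this file uses, such a switch could look like: */
#include <stdlib.h>
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#endif

static void* toy_alloc_cpu_buffer(size_t size, int pin_mem)
{
#ifdef HAVE_CUDA
  if (pin_mem)
  {
    void* ptr = 0;
    /* Page-locked host memory; note the matching free must be cudaFreeHost. */
    if (cudaHostAlloc(&ptr, size, cudaHostAllocDefault) == cudaSuccess)
      return ptr;
  }
#else
  (void)pin_mem;
#endif
  return malloc(size); /* plain pageable memory otherwise */
}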
1900
1.19k
  if (p_node_info)
1901
62
  {
1902
62
    assert(p_tensor_symbol_info);
1903
62
    // Mark it as used if it is used in either input or output.
1904
165
    
for (i = 0; 62
i < p_node_info->input_size;
i++103
)
1905
103
      if (p_node_info->inputs[i] >= 0)
1906
103
      {
1907
103
        const int d = p_node_info->inputs[i];
1908
103
        if (p_tensor_symbol_info[d].s_ref && 
p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx101
)
1909
92
        {
1910
92
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
1911
92
          if (dd >= 0) // If this exists in this sub-graph, great.
1912
80
            tensor_blocks[dd].flags = 0;
1913
92
        }
1914
103
      }
1915
132
    for (i = 0; i < p_node_info->output_size; 
i++70
)
1916
70
      if (p_node_info->outputs[i] >= 0)
1917
70
      {
1918
70
        const int d = p_node_info->outputs[i];
1919
70
        if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
1920
70
        {
1921
70
          const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1) - 1;
1922
70
          if (dd >= 0) // If this exists in this sub-graph, great.
1923
70
            tensor_blocks[dd].flags = 0;
1924
70
        }
1925
70
      }
1926
62
  }
1927
21.5k
  
for (i = 0; 1.19k
i < symbolic_graph->tensor_symbol_info->rnum;
i++20.3k
)
1928
20.3k
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
1929
20.3k
    {
1930
18.1k
      // Check no tensor info is auto now.
1931
18.1k
      assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info));
1932
18.1k
      // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
1933
18.1k
      // therefore, its life-cycle almost certainly won't concatenate properly with the tensor to
1934
18.1k
      // fold to).
1935
18.1k
      if (tensor_symbol_info[i].assign_ref)
1936
40
      {
1937
40
        // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
1938
40
        // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input,
1939
40
        // it keeps its own representation, which is not the case for output).
1940
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
1941
40
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1942
40
        // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
1943
40
        TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref]);
1944
40
        // It also cannot be folded as output (except i), because we need to keep its own representation.
1945
40
        TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref]);
1946
40
        assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0);
1947
40
        tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
1948
63
        for (j = 0; j < unroll_count; 
j++23
)
1949
23
        {
1950
23
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
1951
23
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]);
1952
23
        }
1953
40
        if (tensor_blocks[assign_ref].bypass_ref)
1954
4
        {
1955
4
          // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
1956
4
          tensor_blocks[assign_ref].unfoldable_except_ref = 0;
1957
4
          const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
1958
4
          TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref]);
1959
4
          TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref]);
1960
4
          // On the other hand, it can be folded into the except_ref for the bypass_ref.
1961
4
          tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
1962
4
          if (dup_tensor_from_ref)
1963
2
          {
1964
2
            const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
1965
2
            if (bypass_from_ref >= 0)
1966
2
            {
1967
2
              TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref]);
1968
2
              TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref]);
1969
2
              assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref);
1970
2
              for (j = 0; j < unroll_count - 1; 
j++0
)
1971
0
              {
1972
0
                // Mark every incarnation as un-foldable.
1973
0
                TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
1974
0
                TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]]);
1975
0
              }
1976
2
            }
1977
2
          }
1978
4
        }
1979
40
      }
1980
18.1k
    }
1981
21.5k
  
for (i = 0; 1.19k
i < symbolic_graph->tensor_symbol_info->rnum;
i++20.3k
)
1982
20.3k
  {
1983
20.3k
    // If it has a peer reference, we don't need to allocate this tensor at all,
1984
20.3k
    // set it to be unassigned.
1985
20.3k
    if (tensor_symbol_info[i].peer_ref)
1986
20.3k
      
TENSOR_EXPECT_SET_UNASSIGNED15
(tensor_blocks[i]);
1987
20.3k
    // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use tape properly).
1988
20.3k
    else 
if (20.3k
tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR20.3k
) {
1989
7
      TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
1990
7
      TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i]);
1991
7
      // For this case, there is no exception.
1992
7
      tensor_blocks[i].unfoldable_except_ref = 0;
1993
20.3k
    } else if (tensor_symbol_info[i].p_ref) {
1994
119
      assert(p_node_info);
1995
119
      const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
1996
119
      // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
1997
119
      if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
1998
48
        // TODO: This check can be lifted if we can fold in the parent graph.
1999
48
        if (-1 == p_ref_is_in_or_out)
2000
48
          
TENSOR_SET_UNFOLDABLE_AS_INPUT20
(tensor_blocks[i]);
2001
119
      if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be folded as input.
2002
119
        
TENSOR_SET_UNFOLDABLE_AS_INPUT68
(tensor_blocks[i]);
2003
119
    }
2004
20.3k
  }
2005
21.5k
  
for (i = 0; 1.19k
i < symbolic_graph->tensor_symbol_info->rnum;
i++20.3k
)
2006
20.3k
  {
2007
20.3k
    if (tensor_symbol_info[i].alias_ref)
2008
151
    {
2009
151
      const int ref = tensor_symbol_info[i].alias_ref - 1;
2010
151
      // If the referenced one is unassigned, mark it as assigned only if the current one is assigned.
2011
151
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]) && 
!27
TENSOR_EXPECT_UNASSIGNED27
(tensor_blocks[i]))
2012
151
        
tensor_blocks[ref].flags = 023
;
2013
151
      // An alias cannot ref to another alias.
2014
151
      assert(!tensor_symbol_info[ref].alias_ref);
2015
151
      tensor_blocks[i].flags = ALIAS;
2016
151
      tensor_blocks[i].ref = ref + 1; // Assign the ref.
2017
151
      if (!tensor_blocks[ref].r_refs)
2018
118
        tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2019
151
      ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2020
151
    }
2021
20.3k
  }
2022
1.19k
  // Scan again, and if the ref is not assigned, mark the alias as not assigned.
2023
21.5k
  
for (i = 0; 1.19k
i < symbolic_graph->tensor_symbol_info->rnum;
i++20.3k
)
2024
20.3k
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
2025
20.3k
    {
2026
151
      const int ref = tensor_blocks[i].ref - 1;
2027
151
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref]))
2028
151
      {
2029
4
        // Mark this as unassigned.
2030
4
        tensor_blocks[i].flags = UNASSIGNED;
2031
4
        tensor_blocks[i].ref = 0;
2032
4
      }
2033
151
    }
2034
21.5k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++20.3k
)
2035
20.3k
  {
2036
20.3k
    // If this tensor is not expected to be unassigned, allocate its head and tail arrays.
2037
20.3k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
2038
20.3k
    {
2039
18.0k
      tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2040
18.0k
      tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2041
18.0k
      // Cache tensor size (align to 16 bytes).
2042
18.0k
      tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2043
18.0k
    }
2044
20.3k
    // If there is a p_ref, add the one to the p_refs list.
2045
20.3k
    if (tensor_symbol_info[i].p_ref)
2046
128
      tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2047
20.3k
  }
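/* Illustrative sketch (hypothetical, not the library's ccv_nnc_tensor_data_size): the size cached
 * above is the raw tensor byte count rounded up for alignment. Rounding a byte count up to a
 * 16-byte boundary is the usual bit trick: */
#include <stdint.h>
#include <stdio.h>

static uint64_t toy_aligned_size(uint64_t bytes)
{
  return (bytes + 15) & ~(uint64_t)15; /* round up to the next multiple of 16 */
}

int main(void)
{
  printf("%llu\n", (unsigned long long)toy_aligned_size(100)); /* prints 112 */
  return 0;
}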
2048
7.70k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2049
29.9k
    for (i = 0; i < node->input_size; 
i++22.2k
)
2050
22.2k
    {
2051
22.2k
      int d = node->inputs[i];
2052
22.2k
      if (d < 0)
2053
5.22k
        continue;
2054
16.9k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2055
16.9k
        
d = tensor_symbol_info[d].alias_ref - 1116
;
2056
16.9k
      tensor_blocks[d].flags |= READ_ONLY;
2057
16.9k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2058
16.9k
        
continue15
;
2059
16.9k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2060
16.9k
      /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph,
2061
16.9k
       * so it lives from the very beginning of the graph life-cycle and ends here). */
2062
16.9k
      if (tensor_blocks[d].head->rnum == 0 && 
!6.23k
TENSOR_REQUIRE_INIT6.23k
(tensor_symbol_info[d].flags))
2063
16.9k
      {
2064
15.2k
        for (j = 0; j < source_size; 
j++9.00k
)
2065
9.00k
        {
2066
9.00k
          // If the source is connecting to the current node, add it (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2067
9.00k
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2068
9.00k
          if (cell.i32 && 
cell.i32[0] > 06.56k
)
2069
6.56k
            _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2070
9.00k
        }
2071
6.19k
        /* If this is a read-only (based on SSA, if first encountered as read), and this is
2072
6.19k
         * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2073
6.19k
         * loop, however, in that case, you need to prevent read-only gets reused for the
2074
6.19k
         * output tensor, which is not obvious how to implement correctly), and it is not
2075
6.19k
         * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2076
6.19k
         * of memory anyway (because on second loop, we want to read the same value out).
2077
6.19k
         * Mark it to the end of the graph. */
2078
6.19k
        if (p_node_info && 
!tensor_symbol_info[d].assign_ref146
)
2079
210
          
for (j = 0; 105
j < destination_size;
j++105
)
2080
105
          {
2081
105
            // If the destination is connecting to the current node, add it (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2082
105
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2083
105
            if (cell.i32 && 
cell.i32[0] > 065
)
2084
65
              _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2085
105
          }
2086
6.19k
      }
2087
16.9k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2088
16.9k
    }
2089
20.7k
    
for (i = 0; 7.70k
i < node->output_size;
i++12.9k
)
2090
12.9k
    {
2091
12.9k
      int d = node->outputs[i];
2092
12.9k
      if (d < 0)
2093
1.18k
        continue;
2094
11.8k
      if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2095
11.8k
        
d = tensor_symbol_info[d].alias_ref - 161
;
2096
11.8k
      tensor_blocks[d].flags |= WRITE_ONLY;
2097
11.8k
      if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2098
11.8k
        
continue0
;
2099
11.8k
      assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2100
11.8k
      _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2101
11.8k
    }
2102
7.70k
  } ccv_nnc_graph_visit_endfor
2103
1.19k
  // For any assign_ref, its life-time is kept until the end and wraps over.
2104
21.5k
  
for (i = 0; 1.19k
i < symbolic_graph->tensor_symbol_info->rnum;
i++20.3k
)
2105
20.3k
    // If this tensor is not unassigned (or an alias) and it is assigned from somewhere else,
2106
20.3k
    // that "somewhere else" needs to keep its life-time till the end.
2107
20.3k
    if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) &&
2108
20.3k
      
p_node_info18.0k
&&
tensor_symbol_info[i].assign_ref282
)
2109
42
    {
2110
42
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2111
84
      for (j = 0; j < destination_size; 
j++42
)
2112
42
      {
2113
42
        // This logic is to be more conservative about which destination we add to.
2114
42
        // As of now, if we add everything, it is fine most likely. However, it may
2115
42
        // cause issues in the future to do so naively. Thus, instead, we only add
2116
42
        // the destination to it iff either the tensor is not used at all, or, the
2117
42
        // destination is on the same stream as of the tensor block some way.
2118
42
        int flag = !tensor_blocks[assign_ref].tail;
2119
83
        for (k = 0; !flag && 
k < tensor_blocks[assign_ref].tail->rnum73
;
k++41
)
2120
41
        {
2121
41
          const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k);
2122
41
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2123
41
          flag = (cell.i32 && 
cell.i32[0] > 010
);
2124
41
        }
2125
42
        if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2126
10
          _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2127
42
      }
2128
42
    }
2129
1.22k
  for (i = 0; i < output_size; 
i++30
)
2130
30
  {
2131
30
    assert(outputs[i].graph == symbolic_graph);
2132
30
    int d = outputs[i].d;
2133
30
    if (d < 0)
2134
0
      continue;
2135
30
    if (TENSOR_EXPECT_ALIAS(tensor_blocks[d]))
2136
30
      
d = tensor_symbol_info[d].alias_ref - 10
;
2137
30
    if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]))
2138
30
      
continue0
;
2139
30
    assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
2140
180
    
for (j = 0; 30
j < destination_size;
j++150
)
2141
150
    {
2142
150
      int flag = !tensor_blocks[d].tail;
2143
300
      for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; 
k++150
)
2144
150
      {
2145
150
        const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
2146
150
        const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2147
150
        flag = (cell.i32 && 
cell.i32[0] > 00
);
2148
150
      }
2149
150
      if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2150
0
        _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2151
150
    }
2152
30
  }
2153
1.19k
  // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2154
7.70k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2155
7.70k
    int x, y;
2156
29.9k
    for (x = 0; x < node->input_size; 
x++22.2k
)
2157
69.8k
      
for (y = 0; 22.2k
y < node->output_size;
y++47.6k
)
2158
47.6k
        /* Some operations enforces some tensors to be the same for inputs / outputs. */
2159
47.6k
        if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2160
168
        {
2161
168
          // If both unassigned, it is fine.
2162
168
          if (node->inputs[x] < 0 && 
node->outputs[y] < 00
)
2163
0
            continue;
2164
168
          int ref = node->inputs[x];
2165
168
          assert(ref >= 0);
2166
168
          while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref0
)
2167
0
            ref = tensor_blocks[ref].ref - 1;
2168
168
          const int node_output_y = node->outputs[y];
2169
168
          assert(node_output_y >= 0);
2170
168
          // If both are not computable, it is fine, we don't need to enforce.
2171
168
          if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2172
168
            
!0
TENSOR_EXPECT_COMPUTABLE0
(tensor_blocks[node_output_y]))
2173
168
            
continue0
;
2174
168
          // Otherwise, enforce and error out if failed.
2175
168
          if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2176
0
            { assert(0 && "cannot enforce inplace for the two tensors"); }
2177
168
        }
2178
7.70k
  } ccv_nnc_graph_visit_endfor
2179
1.19k
  // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2180
1.19k
  // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2181
1.19k
  // that is not enforced in-place (because the tensor enforced in-place will be different than the
2182
1.19k
  // binding one).
2183
11.0k
  
for (i = 0; 1.19k
i < tensor_bind_size;
i++9.89k
)
2184
9.89k
  {
2185
9.89k
    const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2186
9.89k
    // If there is a tensor bound to it, then it is unassigned.
2187
9.89k
    if (resolved_symbol.d >= 0)
2188
9.89k
    {
2189
9.89k
      int d = resolved_symbol.d;
2190
9.89k
      // If it is unused, this is not an alias.
2191
10.0k
      while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d]) && 
tensor_blocks[d].ref2.14k
)
2192
130
        d = tensor_blocks[d].ref - 1;
2193
9.89k
      // Doesn't work if this is a loop carrying variable.
2194
9.89k
      assert(!tensor_symbol_info[d].assign_ref);
2195
9.89k
      tensor_blocks[d].flags = UNASSIGNED;
2196
9.89k
      tensor_blocks[d].ref = 0; // No need to have ref as well.
2197
9.89k
    }
2198
9.89k
  }
2199
1.19k
  // Maximize tensor reuse by collapsing tensors where the command allows in-place operations (and the start / end tensors match).
2200
7.70k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2201
7.70k
    int x, y;
2202
29.9k
    for (x = 0; x < node->input_size; 
x++22.2k
)
2203
22.2k
    {
2204
22.2k
      /* If the input is not assigned, it can be referenced, find the referenced one */
2205
22.2k
      int ref = node->inputs[x];
2206
22.2k
      if (ref < 0)
2207
5.22k
        continue;
2208
18.8k
      
while (16.9k
!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
tensor_blocks[ref].ref9.81k
)
2209
1.80k
        ref = tensor_blocks[ref].ref - 1;
2210
16.9k
      assert(tensor_blocks[ref].ref == 0);
2211
16.9k
      const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2212
16.9k
      if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) &&
2213
16.9k
        
tensor_blocks[ref].tail->rnum == 18.99k
)
2214
25.1k
        
for (y = 0; 8.88k
y < node->output_size;
y++16.2k
)
2215
16.2k
          /* Only proceed if the input symbol is different from the output symbol, */
2216
16.2k
          /* and the input symbol meets the output symbol exactly at the same spot. */
2217
16.2k
          if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2218
16.2k
            
node->outputs[y] >= 03.18k
&&
2219
16.2k
            
ref != node->outputs[y]3.18k
&&
2220
16.2k
            
TENSOR_EXPECT_COMPUTABLE3.18k
(tensor_blocks[node->outputs[y]]))
2221
16.2k
          {
2222
2.03k
            const int node_output_y = node->outputs[y];
2223
2.03k
            const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2224
2.03k
            /* If dimension matches perfectly, then we can assign y_symbol to x. */
2225
2.03k
            if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
2226
2.00k
              _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2227
2.03k
          }
2228
16.9k
    }
2229
7.70k
  } ccv_nnc_graph_visit_endfor
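/* Illustrative sketch (hypothetical toy_* helper, not from this file): the pass above folds an
 * input block into an output block for in-place-capable commands only when the input is
 * computable, dies at exactly one exec (tail->rnum == 1), differs from the output, and the two
 * symbols have identical dimensions. A simplified version of that gate: */
#include <stdio.h>
#include <string.h>

#define TOY_MAX_DIM 12 /* stand-in for CCV_NNC_MAX_DIM_ALLOC */

typedef struct {
  int dim[TOY_MAX_DIM];
  int tail_count; /* how many execs end this block's life-time */
} toy_block_t;

/* Returns 1 when an in-place fold of `in` onto `out` is worth attempting. */
static int toy_can_fold_inplace(const toy_block_t* in, const toy_block_t* out)
{
  if (in == out) /* same block, nothing to fold */
    return 0;
  if (in->tail_count != 1) /* the input must die at exactly one exec */
    return 0;
  /* Dimensions must match exactly, mirroring the memcmp over dim above. */
  return memcmp(in->dim, out->dim, sizeof(in->dim)) == 0;
}

int main(void)
{
  toy_block_t a = { .dim = { 128, 128 }, .tail_count = 1 };
  toy_block_t b = { .dim = { 128, 128 }, .tail_count = 2 };
  printf("fold a onto b: %d\n", toy_can_fold_inplace(&a, &b)); /* prints 1 */
  return 0;
}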
2230
1.19k
  // Specifically handle the bypass. This needs to be done after the first pass.
2231
1.19k
  // I need to extend the bypass life-time to the same as the one I am going with.
2232
1.19k
  // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2233
1.19k
  ccv_nnc_tensor_block_t empty_block = {};
2234
1.19k
  empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2235
1.19k
  empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2236
7.70k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2237
7.70k
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2238
13
    {
2239
13
      int can_bypass = 1;
2240
28
      for (i = 0; can_bypass && 
i < node->output_size25
;
i++15
)
2241
15
      {
2242
15
        int d = node->outputs[i];
2243
15
        if (d < 0)
2244
0
          continue;
2245
15
        if (!tensor_blocks[d].bypass_ref)
2246
2
          continue;
2247
13
        while (tensor_blocks[d].ref)
2248
0
          d = tensor_blocks[d].ref - 1;
2249
13
        int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2250
14
        while (tensor_blocks[bypass_ref].ref)
2251
1
          bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2252
13
        // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2253
13
        if (!tensor_symbol_info[bypass_ref].assign_ref && 
!tensor_symbol_info[bypass_ref].r_assign_ref10
)
2254
10
          continue;
2255
3
        ccv_array_clear(empty_block.head);
2256
6
        for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; 
j++3
)
2257
3
          ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j));
2258
3
        ccv_array_clear(empty_block.tail);
2259
6
        for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; 
j++3
)
2260
3
          ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j));
2261
6
        for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++3
)
2262
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), empty_block);
2263
6
        for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++3
)
2264
3
          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), empty_block);
2265
3
        // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2266
3
        assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref));
2267
3
        int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : 
tensor_symbol_info[bypass_ref].r_assign_ref - 10
;
2268
3
        while (tensor_blocks[b_ref].ref)
2269
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2270
3
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2271
3
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2272
3
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2273
3
        // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2274
3
        can_bypass = can_bypass && (a_hop_b || b_hop_a);
2275
3
      }
2276
13
      if (can_bypass)
2277
10
      {
2278
22
        for (i = 0; i < node->output_size; 
i++12
)
2279
12
        {
2280
12
          int d = node->outputs[i];
2281
12
          if (d < 0)
2282
0
            continue;
2283
12
          if (!tensor_blocks[d].bypass_ref)
2284
2
            continue;
2285
10
          while (tensor_blocks[d].ref)
2286
0
            d = tensor_blocks[d].ref - 1;
2287
10
          int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2288
10
          while (tensor_blocks[bypass_ref].ref)
2289
0
            bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2290
10
          // The bypass_ref can extend its life-time.
2291
20
          for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; 
j++10
)
2292
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j), tensor_blocks[bypass_ref]);
2293
20
          for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; 
j++10
)
2294
10
            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j), tensor_blocks[bypass_ref]);
2295
10
        }
2296
10
      } else {
2297
6
        for (i = 0; i < node->output_size; 
i++3
)
2298
3
          tensor_blocks[node->outputs[i]].bypass_ref = 0;
2299
3
        const int exec_idx = (dup_exec_from_ref) ? 
dup_exec_from_ref[idx]1
:
idx2
;
2300
3
        // Mark this exec as no bypass IO (thus, I need to insert explicit data transfers).
2301
3
        exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2302
3
      }
2303
13
    }
2304
7.70k
  } ccv_nnc_graph_visit_endfor
2305
1.19k
  ccv_array_free(empty_block.head);
2306
1.19k
  ccv_array_free(empty_block.tail);
2307
1.19k
  *r_exec_dep = exec_dep;
2308
1.19k
  *r_tensor_blocks = tensor_blocks;
2309
1.19k
}
2310
2311
static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2312
33
{
2313
33
  if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || 
cmd.cmd == CCV_NNC_GRAPH_BACKWARD30
)
2314
3
  {
2315
3
    ccv_nnc_cmd_t retval = cmd;
2316
3
    retval.cmd = CCV_NNC_NOOP;
2317
3
    return retval;
2318
3
  }
2319
30
  return cmd;
2320
30
}
2321
2322
static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2323
102
{
2324
102
  if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2325
47
  {
2326
47
    if (tensor_symbol_info[input].alias_ref)
2327
18
    {
2328
18
      const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2329
18
      assert(tensor_symbol_info[alias_ref].alias_ref == 0);
2330
18
      ccv_nnc_tensor_symbol_t tensor_symbol = {};
2331
18
      if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2332
6
      {
2333
6
        tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2334
6
        if (tensor_symbol_info[alias_ref].peer_ref)
2335
0
          ccv_nnc_tensor_symbol_set_peer(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2336
0
            .d = tensor_symbol_info[alias_ref].peer_ref - 1,
2337
0
            .graph = dup_graph->peer
2338
0
          });
2339
6
        ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2340
6
        dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2341
12
      } else {
2342
12
        tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2343
12
        tensor_symbol.graph = dup_graph;
2344
12
      }
2345
18
      ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].inc, tensor_symbol_info[input].info, 0);
2346
18
      if (tensor_symbol_info[input].peer_ref)
2347
0
        ccv_nnc_tensor_symbol_set_peer(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2348
0
          .d = tensor_symbol_info[input].peer_ref - 1,
2349
0
          .graph = dup_graph->peer
2350
0
        });
2351
18
      ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2352
18
      dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2353
29
    } else {
2354
29
      ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2355
29
      if (tensor_symbol_info[input].peer_ref)
2356
4
        ccv_nnc_tensor_symbol_set_peer(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2357
4
          .d = tensor_symbol_info[input].peer_ref - 1,
2358
4
          .graph = dup_graph->peer
2359
4
        });
2360
29
      ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2361
29
      dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2362
29
    }
2363
47
    if (tensor_symbol_info[input].bypass_ref)
2364
2
    {
2365
2
      const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2366
2
      assert(dup_bypass_ref >= 0);
2367
2
      ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count]);
2368
2
      symbol_info->bypass_ref = dup_bypass_ref + 1;
2369
2
    }
2370
47
  }
2371
102
  return (ccv_nnc_tensor_symbol_t) {
2372
102
    .d = dup_tensor_block_ref[input * unroll_count],
2373
102
    .graph = dup_graph,
2374
102
  };
2375
102
}
2376
2377
static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2378
72
{
2379
72
  int i;
2380
72
  if (dup_exec_ref[idx * unroll_count] < 0)
2381
44
  {
2382
44
    // Input has to come before output, because the output could have a bypass reference to the input.
2383
116
    for (i = 0; i < node->input_size; 
i++72
)
2384
72
      max_inputs[i] = (node->inputs[i] >= 0) ? 
_ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i])71
:
(ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph }1
;
2385
75
    for (i = 0; i < node->output_size; 
i++31
)
2386
31
      max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : 
(ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph }0
;
2387
44
    ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2388
44
    dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2389
44
  }
2390
72
  return (ccv_nnc_graph_exec_symbol_t) {
2391
72
    .d = dup_exec_ref[idx * unroll_count],
2392
72
    .graph = dup_graph,
2393
72
  };
2394
72
}
2395
2396
static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2397
1.19k
{
2398
1.19k
  int i;
2399
21.6k
  for (i = 0; i < tensor_block_size; 
i++20.4k
)
2400
20.4k
  {
2401
20.4k
    if (tensor_blocks[i].head)
2402
16.1k
      ccv_array_free(tensor_blocks[i].head);
2403
20.4k
    if (tensor_blocks[i].tail)
2404
16.1k
      ccv_array_free(tensor_blocks[i].tail);
2405
20.4k
    if (tensor_blocks[i].r_refs)
2406
2.08k
      ccv_array_free(tensor_blocks[i].r_refs);
2407
20.4k
    if (tensor_blocks[i].dup_p_refs)
2408
22
      ccv_array_free(tensor_blocks[i].dup_p_refs);
2409
20.4k
  }
2410
1.19k
  ccfree(tensor_blocks);
2411
1.19k
}
2412
2413
// Find tensors that cannot be solved by co-allocating to the same location.
2414
static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2415
21
{
2416
21
  int i, j, unroll_count = 0;
2417
131
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++110
)
2418
110
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
tensor_symbol_info[i].assign_ref90
)
2419
25
    {
2420
25
      // This is a parameter; thus, it has to be either an alias or used.
2421
25
      assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]));
2422
25
      const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2423
25
      // The parameter it assign to has to be either an alias or used.
2424
25
      assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]));
2425
25
      // If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2426
25
      // If it is the same, we are good, no need to extend.
2427
25
      int a_ref = i;
2428
25
      while (tensor_blocks[a_ref].ref)
2429
0
        a_ref = tensor_blocks[a_ref].ref - 1;
2430
25
      int b_ref = assign_ref;
2431
31
      while (tensor_blocks[b_ref].ref)
2432
6
        b_ref = tensor_blocks[b_ref].ref - 1;
2433
25
      if (a_ref != b_ref)
2434
19
      {
2435
19
        // If any of the b's head is deterministically later than a's tail
2436
19
        // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2437
19
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2438
19
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2439
19
        // It cannot be that both i can hop to j and j can hop to i.
2440
19
        assert(!(a_hop_b > 0 && b_hop_a > 0));
2441
19
        // Can it be folded?
2442
19
        // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2443
19
        if (a_hop_b || 
b_hop_a16
)
2444
3
        {
2445
3
          tensor_blocks[a_ref].companion_ref = b_ref + 1;
2446
3
          tensor_blocks[b_ref].companion_ref = a_ref + 1;
2447
3
          continue;
2448
3
        }
2449
16
        int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2450
20
        for (j = 0; c_ref >= 0; 
j++4
)
2451
4
        {
2452
4
          while (tensor_blocks[c_ref].ref)
2453
0
            c_ref = tensor_blocks[c_ref].ref - 1;
2454
4
          c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2455
4
        }
2456
16
        unroll_count = ccv_max(unroll_count, j + 1);
2457
16
      }
2458
25
    }
2459
21
  // Reset companion_ref if need to unroll.
2460
21
  if (unroll_count)
2461
91
    
for (j = 0; 13
j < symbolic_graph->tensor_symbol_info->rnum;
j++78
)
2462
78
      tensor_blocks[j].companion_ref = 0;
2463
21
  return unroll_count;
2464
21
}
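/* Illustrative sketch (hypothetical toy_* helper, not from this file): when a loop-carried pair
 * cannot share memory, the required unroll count grows with the length of the assign_ref chain
 * that the function above walks. Counting that chain over a plain 1-based array (0 means "none"): */
#include <stdio.h>

static int toy_assign_ref_chain_length(const int* assign_ref, int start)
{
  int len = 0;
  int c = assign_ref[start] - 1; /* assign_ref is 1-based, like in the symbol info */
  while (c >= 0)
  {
    ++len;
    c = assign_ref[c] - 1;
  }
  return len;
}

int main(void)
{
  /* Symbol 2 carries into 1, 1 carries into 0, 0 carries into nothing. */
  const int assign_ref[3] = { 0, 1, 2 };
  /* Mirrors unroll_count = ccv_max(unroll_count, j + 1) in the function above. */
  printf("unroll candidate: %d\n", toy_assign_ref_chain_length(assign_ref, 2) + 1); /* prints 3 */
  return 0;
}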
2465
2466
static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2467
13
{
2468
13
  int i, j, n;
2469
13
  // The inout exec nodes, these are the nodes we are going to extend.
2470
13
  uint8_t* inout = (uint8_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2471
13
  int max_input_size = 0;
2472
13
  int max_output_size = 0;
2473
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++35
)
2474
35
  {
2475
35
    max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size);
2476
35
    max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size);
2477
35
  }
2478
13
  ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)];
2479
13
  ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)];
2480
13
  // Doing graph expansion
2481
13
  // It goes without saying that we must have more than one tensor / exec (otherwise I cannot use 0 as no exec ref).
2482
13
  assert(dup_graph->exec_symbol_info->rnum > 0);
2483
13
  assert(dup_graph->tensor_symbol_info->rnum > 0);
2484
88
#define INCOMING_NODE (1)
2485
28
#define OUTGOING_NODE (2)
2486
13
  // Unroll the graph n times.
2487
29
  
for (n = 0; 13
n < unroll_count;
n++16
)
2488
16
  {
2489
16
    int* const dup_exec_ref = r_dup_exec_ref + n;
2490
16
    const int* const prev_dup_tensor_block_ref = n > 0 ? 
r_dup_tensor_block_ref + (n - 1)3
:
013
;
2491
16
    int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2492
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2493
46
      dup_exec_ref[i * unroll_count] = -1;
2494
131
    for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++115
)
2495
115
    {
2496
115
      // If there is an assign_ref, that means I don't need to dup the tensor.
2497
115
      if (tensor_symbol_info[i].assign_ref)
2498
25
      {
2499
25
        const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2500
25
        dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? 
prev_dup_tensor_block_ref[assign_ref * unroll_count]8
:
assign_ref17
;
2501
90
      } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && 
TENSOR_READ_WRITE52
(tensor_blocks[i]) == READ_ONLY52
)
2502
26
      // If this is a read-only tensor block, no need to duplicate because the value never changes
2503
26
      // (note we handled assign_ref first), therefore, no need to generate duplicate.
2504
26
        dup_tensor_block_ref[i * unroll_count] = i;
2505
64
      else
2506
64
        dup_tensor_block_ref[i * unroll_count] = -1;
2507
115
    }
2508
16
    // Go through the original graph, make copies of the node if it is inout.
2509
44
    ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2510
44
      ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2511
44
      inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2512
44
      if (!node->outgoings)
2513
16
        continue;
2514
56
      
for (i = 0; 28
i < node->outgoings->rnum;
i++28
)
2515
28
      {
2516
28
        const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
2517
28
        inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2518
28
        ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2519
28
        ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2520
28
      }
2521
28
    } ccv_nnc_graph_visit_endfor
2522
16
    // Check that the visited nodes are all marked as either incoming or outgoing.
2523
16
    const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2524
16
    const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2525
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2526
46
    {
2527
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2528
46
        
continue2
;
2529
44
      assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE));
2530
44
      // If this is a pure incoming node, then I need to concat this one with all original destination nodes
2531
44
      if (inout[i] == INCOMING_NODE)
2532
44
        
for (j = 0; 16
j < dup_destination_size32
;
j++16
)
2533
16
        {
2534
16
          ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2535
16
            .d = dup_destinations[j].d,
2536
16
            .graph = dup_graph,
2537
16
          }, (ccv_nnc_graph_exec_symbol_t) {
2538
16
            .d = dup_exec_ref[i * unroll_count],
2539
16
            .graph = dup_graph,
2540
16
          });
2541
16
        }
2542
44
    }
2543
16
    if (dup_graph->destinations)
2544
16
      ccv_array_clear(dup_graph->destinations);
2545
62
    for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++46
)
2546
46
    {
2547
46
      if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2548
46
        
continue2
;
2549
44
      const int d = dup_exec_ref[i * unroll_count];
2550
44
      ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d);
2551
44
      // If this has no outgoing node, add to the destination.
2552
44
      if (!exec_symbol_info->outgoings || 
exec_symbol_info->outgoings->rnum == 028
)
2553
16
        ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2554
16
          .graph = dup_graph,
2555
16
          .d = d,
2556
16
        });
2557
44
    }
2558
16
  }
2559
13
#undef INCOMING_NODE
2560
13
#undef OUTGOING_NODE
2561
13
  ccfree(inout);
2562
13
}
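/* Illustrative sketch (hypothetical toy_* helper, not from this file): dup_exec_ref and
 * dup_tensor_block_ref are flat maps indexed as [original * unroll_count + n], storing the
 * duplicated symbol's index for unroll iteration n; read-only blocks simply map back to
 * themselves, and -1 marks a symbol that has not been duplicated yet. The lookup used throughout
 * the unrolling code boils down to: */
#include <stdio.h>

static int toy_dup_ref(const int* map, int unroll_count, int original, int n)
{
  return map[original * unroll_count + n]; /* -1 means "no duplicate for this iteration" */
}

int main(void)
{
  /* Two originals unrolled twice: original 0 is shared across iterations (maps to itself),
   * original 1 gets fresh duplicates 3 and 4. */
  const int map[4] = { 0, 0, 3, 4 };
  printf("dup of symbol 1 at iteration 1: %d\n", toy_dup_ref(map, 2, 1, 1)); /* prints 4 */
  return 0;
}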
2563
2564
static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2565
13
{
2566
13
  int i;
2567
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2568
78
    // Now can assign them (The dup) as companion.
2569
78
    // Get to the last one, which we will wrap over.
2570
78
    if (dup_tensor_symbol_info[i].assign_ref)
2571
17
    {
2572
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2573
17
      dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2574
17
      assert(dup_tensor_symbol_info[i].assign_ref);
2575
17
      dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2576
17
    }
2577
13
}
2578
2579
// If the tensor blocks are the outputs of this graph, their life-times should be extended to the end of this graph.
2580
// However, it is not that simple if the graph is unrolled. For an unrolled graph, they need to reach the end of
2581
// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2582
static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2583
21
{
2584
21
  int i, j, k;
2585
45
  for (i = 0; i < p_node_info->output_size; 
i++24
)
2586
24
  {
2587
24
    const int d = p_node_info->outputs[i];
2588
24
    const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx) - 1;
2589
24
    if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref]))
2590
24
      
continue6
;
2591
36
    
for (k = 0; 18
k < destination_size;
k++18
)
2592
18
      _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2593
18
    // Add the duplicated destinations to the tensor_block_ref.
2594
42
    for (j = 0; j < unroll_count; 
j++24
)
2595
48
      
for (k = 0; 24
k < destination_size;
k++24
)
2596
24
      {
2597
24
        const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2598
24
        const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2599
24
        if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2600
24
          _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2601
24
      }
2602
18
  }
2603
21
}
2604
2605
static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2606
21
{
2607
21
  int i, j;
2608
21
  ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2609
21
  ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2610
21
  // Find the blocks that cannot be simply solved with either in-place tensor block folding or by using the same memory region.
2611
21
  // Unfortunately, I cannot do this analysis for the block folding done for sub-graphs, because we do sub-graph placement later.
2612
21
  // If nothing needs to be unrolled, there is no need to change anything; we are good.
2613
21
  const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2614
21
  if (!unroll_count)
2615
8
    return;
2616
13
  // We have conditions that cannot be satisfied with a simple solution (allocating to the same memory region).
2617
13
  // Doing graph expansion: first duplicate the old graph, but replace all sub-graphs with noop.
2618
13
  ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2619
13
  int* dup_exec_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2620
13
  int* dup_tensor_block_ref = (int*)ccmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2621
13
  _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2622
13
  ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2623
13
  ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2624
26
  ccv_nnc_graph_visit_t* dup_visit = 
ccv_nnc_graph_visit_new13
(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0);
2625
26
  ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get13
(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get13
(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2626
26
  _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2627
26
  // Free out the old exec_dep
2628
26
  ccv_matrix_free(exec_dep);
2629
26
  // and the tensor blocks, prepare for the new.
2630
26
  _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2631
26
  // A reverse map to find where the original tensor comes from.
2632
26
  int* dup_tensor_from_ref = (int*)
ccmalloc13
(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2633
142
  for (i = 0; i < dup_graph->tensor_symbol_info->rnum; 
i++129
)
2634
129
    dup_tensor_from_ref[i] = -1;
2635
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2636
193
    
for (j = 0; 78
j < unroll_count;
j++115
)
2637
115
      if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2638
104
        dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2639
26
  int* dup_exec_from_ref = (int*)
ccmalloc13
(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2640
90
  for (i = 0; i < dup_graph->exec_symbol_info->rnum; 
i++77
)
2641
77
    dup_exec_from_ref[i] = -1;
2642
48
  for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; 
i++35
)
2643
35
  {
2644
35
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags))
2645
35
      
continue2
;
2646
33
    dup_exec_from_ref[i] = i; // Reference back.
2647
77
    for (j = 0; j < unroll_count; 
j++44
)
2648
44
      if (dup_exec_ref[i * unroll_count + j] >= 0)
2649
44
        dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2650
33
  }
2651
26
  // Reset all attr.
2652
26
  memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2653
26
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get13
(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)
ccv_array_get13
(dup_graph->destinations, 0), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2654
26
  ccv_nnc_graph_visit_free(dup_visit);
2655
26
  
ccfree13
(dup_exec_symbol_info);
2656
26
  
ccfree13
(dup_exec_from_ref);
2657
26
  
ccfree13
(dup_tensor_from_ref);
2658
26
  // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2659
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2660
78
    // Loop over all possible duplications to assign dup_p_ref properly.
2661
193
    
for (j = 0; 78
j < unroll_count;
j++115
)
2662
115
    {
2663
115
      const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2664
115
      if (dup_idx >= 0 && 
(104
tensor_blocks[i].p_refs[0]104
||
tensor_blocks[i].p_refs[1]60
))
2665
44
      {
2666
44
        const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2667
44
        const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2668
44
        if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2669
28
        {
2670
28
          if (!tensor_blocks[dup_idx].dup_p_refs)
2671
22
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2672
28
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2673
28
        }
2674
44
        if (p_ref_0_is_in_or_out == 1 || 
tensor_blocks[i].p_refs[1] == 016
)
2675
44
          continue;
2676
0
        const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2677
0
        const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2678
0
        if (p_ref_1_is_in_or_out == 1)
2679
0
        {
2680
0
          if (!tensor_blocks[dup_idx].dup_p_refs)
2681
0
            tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2682
0
          ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2683
0
        }
2684
0
      }
2685
115
    }
2686
26
  // companion_ref
2687
91
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++78
)
2688
78
    // Now we can assign them (the dups) as companions.
2689
78
    if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]) && 
dup_tensor_symbol_info[i].assign_ref71
)
2690
17
    {
2691
17
      // Get to the last one, which we will wrap over.
2692
17
      const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2693
17
      if (assign_ref >= 0)
2694
17
      {
2695
17
        int b_ref = assign_ref;
2696
17
        while (tensor_blocks[b_ref].ref)
2697
0
          b_ref = tensor_blocks[b_ref].ref - 1;
2698
17
        int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2699
17
        int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2700
17
        // It cannot be that both i can hop to j can j can hop to i.
2701
17
        // And it can be hop from one to another now after duplication.
2702
17
        assert(a_hop_b > 0 || b_hop_a > 0);
2703
17
        tensor_blocks[i].companion_ref = b_ref + 1;
2704
17
        tensor_blocks[b_ref].companion_ref = i + 1;
2705
17
      }
2706
17
    }
2707
26
  
ccfree13
(dup_tensor_symbol_info);
2708
13
  // Extend the dup tensor block ref, prepare for future extensions.
2709
13
  dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2710
110
  for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; 
i++97
)
2711
97
    dup_tensor_block_ref[i] = -1;
2712
13
  // Assign out changed properties.
2713
13
  *r_exec_dep = exec_dep;
2714
13
  *r_tensor_blocks = tensor_blocks;
2715
13
  *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2716
13
  *r_dup_graph = dup_graph;
2717
13
  *r_unroll_count = unroll_count;
2718
13
  *r_dup_exec_ref = dup_exec_ref;
2719
13
  *r_dup_tensor_block_ref = dup_tensor_block_ref;
2720
13
}
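The loop above links a block and its duplicate through the mutual, 1-based companion_ref fields. A minimal standalone sketch of that convention (hypothetical names, not part of this source):

#include <assert.h>

typedef struct {
  int companion_ref; // 1-based index of the companion block; 0 means no companion.
} sketch_block_t;

// Pair blocks i and b_ref the same way the loop above does: store index + 1 so
// that 0 can keep meaning "unset", and make the reference mutual.
static void sketch_make_companions(sketch_block_t* const blocks, const int i, const int b_ref)
{
  blocks[i].companion_ref = b_ref + 1;
  blocks[b_ref].companion_ref = i + 1;
  assert(blocks[i].companion_ref - 1 == b_ref);
  assert(blocks[b_ref].companion_ref - 1 == i);
}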
2721
2722
static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2723
32
{
2724
32
  if (!anonymous_block_free_list || 
!anonymous_block_free_list_cap16
)
2725
29
    return tensor_block_size;
2726
3
  int i;
2727
3
  const int no_dup_p_refs = (!dup_p_refs || 
!dup_p_refs->rnum0
);
2728
3
  int found_idx = tensor_block_size;
2729
9
  for (i = 0; i < anonymous_block_free_list_cap; 
i++6
)
2730
7
  {
2731
7
    const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i);
2732
7
    assert(idx < tensor_block_size);
2733
7
    // If the type doesn't match, ignore.
2734
7
    if (tensor_blocks[idx].type != type)
2735
0
      continue;
2736
7
    // Heuristic about how to select the best tensor block to move forward.
2737
7
    // If the size is larger and there are no dup_p_refs, I cannot do better than this, just return directly.
2738
7
    if (tensor_blocks[idx].size >= size)
2739
1
    {
2740
1
      if (no_dup_p_refs)
2741
1
        return idx;
2742
0
      // Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
2743
0
      // then we cannot do better than this; if that is the case, just return.
2744
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2745
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2746
0
        return idx;
2747
6
    }
2748
6
    int64_t found_idx_size_diff;
2749
6
    int64_t idx_size_diff;
2750
6
    if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2751
6
      // Now, compare whether this one or the found_idx one is better.
2752
6
      // At this point, there is no point of comparing the dup_p_refs, we only care about which one
2753
6
      // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
2754
6
      
(found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size))4
)
2755
3
    {
2756
3
      found_idx = idx;
2757
3
      continue;
2758
3
    }
2759
3
    // No need to update if found_idx is better than idx.
2760
3
    if (found_idx_size_diff > idx_size_diff)
2761
0
      continue;
2762
3
    // We bias towards the bigger one in case of a tie.
2763
3
    if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
2764
0
    {
2765
0
      found_idx = idx;
2766
0
      continue;
2767
0
    }
2768
3
    assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size);
2769
3
    // On a tie, check which one has tighter life-cycle.
2770
3
    if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
2771
0
    {
2772
0
      // Check whether the current tensor block's life-cycle is longer than the previous one.
2773
0
      if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
2774
0
        (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
2775
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2776
0
        found_idx = idx;
2777
0
      continue;
2778
0
    }
2779
3
    // Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
2780
3
    // We prefer to choose the one that has life-cycle closer to the expected ones.
2781
3
    if (no_dup_p_refs)
2782
3
    {
2783
3
      // Whoever is shorter wins.
2784
3
      if (tensor_blocks[found_idx].dup_p_refs && 
tensor_blocks[found_idx].dup_p_refs->rnum > 00
&&
2785
3
        
(0
!tensor_blocks[idx].dup_p_refs0
||
!tensor_blocks[idx].dup_p_refs->rnum0
||
2786
0
         _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
2787
0
        found_idx = idx;
2788
3
      continue;
2789
3
    }
2790
0
    if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
2791
0
      continue;
2792
0
    if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
2793
0
    {
2794
0
      found_idx = idx;
2795
0
      continue;
2796
0
    }
2797
0
    // If both cover the requested dup_p_refs, we prefer the shorter one; otherwise we prefer the longer one.
2798
0
    const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
2799
0
    const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
2800
0
    if (idx_after_request && found_idx_after_request)
2801
0
    {
2802
0
      if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
2803
0
        found_idx = idx;
2804
0
      continue;
2805
0
    } else {
2806
0
      // If we entered this branch, either idx_after_request is false, or found_idx_after_request is false, or both.
2807
0
      // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
2808
0
      // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
2809
0
      if (!found_idx_after_request && (idx_after_request ||
2810
0
        _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
2811
0
        found_idx = idx;
2812
0
      continue;
2813
0
    }
2814
0
  }
2815
3
  
return found_idx2
;
2816
3
}
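The function above walks the anonymous-block free list with a best-fit heuristic: skip blocks of the wrong type, return immediately on a large-enough block with no dup_p_refs constraint, and otherwise keep the block whose size is closest to the request, biased towards the bigger one on a tie. A simplified sketch of just that size comparison (hypothetical helper, assuming plain arrays instead of ccv_array_t):

#include <stdint.h>
#include <stdlib.h>

// Return the index whose size is closest to the requested size, preferring the
// larger block when two candidates are equally close; -1 if count is 0.
static int sketch_pick_closest_size(const uint64_t* const sizes, const int count, const uint64_t request)
{
  int found = -1;
  int64_t best_diff = 0;
  int i;
  for (i = 0; i < count; i++)
  {
    const int64_t diff = llabs((int64_t)sizes[i] - (int64_t)request);
    if (found < 0 || diff < best_diff || (diff == best_diff && sizes[i] > sizes[found]))
    {
      found = i;
      best_diff = diff;
    }
  }
  return found;
}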
2817
2818
static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
2819
49
{
2820
49
  if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
2821
28
    return 0;
2822
21
  int i, j, k;
2823
21
  int input_size = 0;
2824
43
  for (i = 0; i < p_node_info->p_while.input_size; 
i++22
)
2825
22
    if (p_node_info->p_while.inputs[i] >= 0)
2826
2
      ++input_size;
2827
21
  // If it doesn't have tensor inputs (thus, only special inputs), just return.
2828
21
  if (!input_size)
2829
19
    return 0;
2830
2
  ccv_nnc_tensor_symbol_t inputs[input_size];
2831
2
  input_size = 0;
2832
6
  for (i = 0; i < p_node_info->p_while.input_size; 
i++4
)
2833
4
    if (p_node_info->p_while.inputs[i] >= 0)
2834
2
      inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
2835
2
        .d = p_node_info->p_while.inputs[i],
2836
2
        .graph = symbolic_graph,
2837
2
      };
2838
2
  assert(symbolic_graph->breakpoint_size > 0);
2839
2
  ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
2840
2
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
2841
4
  for (i = 0; i < symbolic_graph->breakpoint_size; 
i++2
)
2842
2
  {
2843
2
    // Make a noop copy of the breakpoint, but with some tensor inputs.
2844
2
    ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
2845
2
    ccv_array_push(dup_breakpoints, &noop);
2846
2
    // Connect this noop to the outgoing nodes of breakpoints.
2847
2
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d);
2848
2
    if (symbol_info->outgoings)
2849
4
      
for (j = 0; 2
j < symbol_info->outgoings->rnum;
j++2
)
2850
2
      {
2851
2
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
2852
2
        ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
2853
2
          .d = d,
2854
2
          .graph = symbolic_graph,
2855
2
        });
2856
2
      }
2857
2
  }
2858
7
  for (i = 0; i < exec_symbol_info_size; 
i++5
)
2859
5
  {
2860
5
    const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
2861
5
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags))
2862
5
      
continue0
;
2863
5
    if (symbol_info->outgoings)
2864
3
    {
2865
3
      const int outgoing_size = symbol_info->outgoings->rnum;
2866
6
      for (j = 0; j < outgoing_size; 
j++3
)
2867
3
      {
2868
3
        const int d = *(int*)ccv_array_get(symbol_info->outgoings, j);
2869
6
        for (k = 0; k < symbolic_graph->breakpoint_size; 
k++3
)
2870
3
          if (d == symbolic_graph->breakpoints[k].d)
2871
0
          {
2872
0
            ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
2873
0
            ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
2874
0
              .d = i,
2875
0
              .graph = symbolic_graph,
2876
0
            }, noop);
2877
0
            // Found, connected, exit.
2878
0
            break;
2879
0
          }
2880
3
      }
2881
3
    }
2882
5
  }
2883
2
  // Add the dup_breakpoints to the sources if necessary.
2884
2
  assert(symbolic_graph->sources);
2885
2
  const int source_size = symbolic_graph->sources->rnum;
2886
4
  for (i = 0; i < source_size; 
i++2
)
2887
2
  {
2888
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
2889
2
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++0
)
2890
2
      if (d == symbolic_graph->breakpoints[j].d)
2891
2
      {
2892
2
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
2893
2
        ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
2894
2
        // Found, made, exit.
2895
2
        break;
2896
2
      }
2897
2
  }
2898
2
  // Add the dup_breakpoints to the destinations if necessary.
2899
2
  assert(symbolic_graph->destinations);
2900
2
  const int destination_size = symbolic_graph->destinations->rnum;
2901
4
  for (i = 0; i < destination_size; 
i++2
)
2902
2
  {
2903
2
    const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
2904
4
    for (j = 0; j < symbolic_graph->breakpoint_size; 
j++2
)
2905
2
      if (d == symbolic_graph->breakpoints[j].d)
2906
0
      {
2907
0
        ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
2908
0
        ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
2909
0
        // Found, made, exit.
2910
0
        break;
2911
0
      }
2912
2
  }
2913
2
  return dup_breakpoints;
2914
2
}
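The function above wires a NOOP exec symbol, carrying the while loop's tensor inputs, in front of each breakpoint so those inputs stay alive across the breakpoint evaluation. A hedged sketch of the same wiring pattern in isolation, using only calls that already appear in this file (the helper name and its caller-supplied arguments are assumptions):

// Create a NOOP that depends on `inputs` and make `next` depend on the NOOP.
// The NOOP computes nothing; it only anchors the inputs in the dependency graph.
static ccv_nnc_graph_exec_symbol_t sketch_insert_noop_before(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_graph_exec_symbol_t next)
{
  const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
  ccv_nnc_graph_exec_symbol_concat(graph, noop, next);
  return noop;
}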
2915
2916
// Plan out how we allocate tensors (should I do optimizations on the graph here or not at all?).
2917
static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
2918
1.17k
{
2919
1.17k
  assert(source_size > 0);
2920
1.17k
  assert(destination_size > 0);
2921
1.17k
  // First, fill all the "auto" holes.
2922
1.17k
  // This is the symbol table with the "auto" info filled up.
2923
1.17k
  ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
2924
1.17k
  ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
2925
1.17k
  ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
2926
2.35k
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new1.17k
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
2927
2.35k
  ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
2928
2.35k
  int i, j, k, p, q;
2929
2.35k
  const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info = p_exec_symbol_info ? 
p_exec_symbol_info + (symbolic_graph->exec_idx - 1)49
:
01.12k
;
2930
2.35k
  ccv_sparse_matrix_t* exec_dep;
2931
2.35k
  ccv_nnc_tensor_block_t* tensor_blocks;
2932
2.35k
  _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
2933
2.35k
  int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2934
2.35k
  // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
2935
2.35k
  // are automatically filled in, and all the sub-graphs are processed.
2936
2.35k
  // There is a last step though, for a while loop, it is parameterized:
2937
2.35k
  // while (x > 5) {
2938
2.35k
  //     y = x + 1;
2939
2.35k
  // } (y => x) // This means after this loop is done, y's value will be copied over to x.
2940
2.35k
  // we will do our best to avoid the actual data copy; what we do here is check whether y can be x's alias.
2941
2.35k
  // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
2942
2.35k
  // it is an inplace operation.
2943
2.35k
  // But if y cannot be x's alias, for example, this while loop looks like this:
2944
2.35k
  // while (x > 5) {
2945
2.35k
  //     y = x + a
2946
2.35k
  //     b = x + y
2947
2.35k
  // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
2948
2.35k
  // For this example, y cannot be x's alias because x is used later to compute b (and that computation
2949
2.35k
  // has dependency on y as well).
2950
2.35k
  // For this case, we need to modify the computation graph. Previously, the graph looks like this:
2951
2.35k
  // y = x + a -> b = x + y
2952
2.35k
  // This graph will be extended to look like this:
2953
2.35k
  // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
2954
2.35k
  // while (x0 > 5) {
2955
2.35k
  //     y0 = x0 + a0
2956
2.35k
  //     b0 = x0 + y0
2957
2.35k
  //     if (y0 > 5) break
2958
2.35k
  //     y1 = y0 + b0
2959
2.35k
  //     b1 = y0 + y1
2960
2.35k
  // } (y1 => x0, b1 => a0)
2961
2.35k
  // After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere
2962
2.35k
  // with each other now).
2963
2.35k
  // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers
2964
2.35k
  // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
2965
2.35k
  ccv_nnc_symbolic_graph_t* dup_graph = 0;
2966
2.35k
  int* dup_exec_ref = 0;
2967
2.35k
  int* dup_tensor_block_ref = 0;
2968
2.35k
  int unroll_count = 0;
2969
2.35k
  // In true recursive fashion, I need to call all the sub-graphs and do the pre-compilation for them one by one.
2970
2.35k
  ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)
ccmalloc1.17k
(sizeof(ccv_nnc_symbolic_graph_prep_t));
2971
2.35k
  prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
2972
2.35k
  prep->flags = 0;
2973
2.35k
  // Cannot handle duplicating a node that is a graph as well.
2974
2.35k
  if (
p_exec_symbol_info1.17k
)
2975
49
  {
2976
49
    prep->flags = p_node_info->flags;
2977
49
    if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
2978
21
    {
2979
21
      _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
2980
21
      _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
2981
28
    } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
2982
28
      // TODO: We want to try our best to fit as many of its corresponding inputs / outputs into the companion_ref group as possible.
2983
28
    }
2984
49
  }
2985
2.35k
  ccv_nnc_symbolic_graph_prep_t** sub_preps = 
symbolic_graph->sub_graphs1.17k
&&
symbolic_graph->sub_graphs->rnum29
?
(ccv_nnc_symbolic_graph_prep_t**)29
cccalloc29
(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) :
01.14k
;
2986
2.35k
  ccv_array_t* anonymous_block_free_list = 0;
2987
2.35k
  const int tensor_fold_size = (tensor_block_size + 31) >> 5;
2988
2.35k
  // Record whether this tensor is folded in this round.
2989
2.35k
  uint32_t* const tensor_fold = (uint32_t*)
ccmalloc1.17k
(sizeof(uint32_t) * tensor_fold_size);
2990
7.63k
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
2991
7.67k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
2992
49
    {
2993
49
      assert(symbolic_graph->sub_graphs);
2994
49
      ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
2995
49
      ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
2996
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
2997
49
      sub_prep->dup_breakpoints = dup_breakpoints;
2998
49
      sub_prep->p = prep;
2999
49
      sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3000
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3001
49
      const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3002
296
      for (i = 0; i < s_alloc_prep->block_size; 
i++247
)
3003
247
      {
3004
247
        const int block_ref = s_alloc_prep->blocks[i].block_ref;
3005
247
        const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3006
247
        if (block_ref < sub_prep->tensor_symbol_info_size)
3007
192
        {
3008
192
          // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3009
192
          // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3010
192
          if (s_tensor_blocks[block_ref].bypass_ref)
3011
1
          {
3012
1
            int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3013
1
            while (s_tensor_blocks[bypass_ref].ref)
3014
0
              bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3015
1
            if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3016
1
              
s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1]0
)
3017
1
              continue;
3018
191
          }
3019
191
          if (s_tensor_blocks[block_ref].p_refs[0])
3020
91
          {
3021
91
            /* If it is already properly assigned, next. */
3022
91
            if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3023
91
              s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3024
91
            {
3025
91
              if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3026
90
                s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3027
1
              else {
3028
1
                assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3029
1
                s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3030
1
              }
3031
91
            }
3032
91
            /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3033
91
            if (s_tensor_blocks[block_ref].p_refs[1] &&
3034
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1]3
&&
3035
91
              
s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1]3
)
3036
3
            {
3037
3
              assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3038
3
              assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3039
3
              s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3040
3
            }
3041
91
          }
3042
191
        } else 
if (55
s_tensor_blocks[block_ref].dup_p_refs55
) {
3043
15
          /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of the anonymous block
3044
15
           * which by default only has a life-cycle shared with this sub-graph node. The reason to extend it is that
3045
15
           * these anonymous blocks that have dup_p_refs may contain data that will be used as output (thus, dup_p_ref
3046
15
           * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3047
15
           * its life-time to the end of the output tensor. */
3048
15
          if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3049
13
            s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3050
33
          for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; 
j++18
)
3051
18
            ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3052
15
        }
3053
247
      }
3054
49
    }
3055
7.63k
    const int init_tensor_block_size = tensor_block_size;
3056
7.63k
    int rw_anonymous_buffer_size_cap = 0;
3057
7.63k
    int ro_anonymous_buffer_size_cap = 0;
3058
7.63k
    if (anonymous_block_free_list)
3059
17
      ccv_array_clear(anonymous_block_free_list);
3060
7.63k
    memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3061
7.67k
    for (p = 0; p < node->graph_ref_size; 
p++49
)
3062
49
    {
3063
49
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3064
49
      const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3065
49
      int rw_anonymous_buffer_size = 0;
3066
49
      int ro_anonymous_buffer_size = 0;
3067
230
      for (i = 0; i < s_alloc_prep->buffer_size; 
i++181
)
3068
181
        if (s_alloc_prep->buffers[i].p_refs[0])
3069
90
        {
3070
90
          /* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */
3071
90
          int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3072
90
          /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3073
90
          int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3074
90
          assert(p_ref_0_is_in_or_out != 0);
3075
90
          int unref_p_ref_0 = p_ref_0;
3076
92
          while (tensor_blocks[unref_p_ref_0].ref)
3077
2
            unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3078
90
          /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3079
90
          assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3080
90
          if (s_alloc_prep->buffers[i].p_refs[1])
3081
4
          {
3082
4
            int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3083
4
            const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3084
4
            assert(p_ref_1_is_in_or_out != 0);
3085
4
            int unref_p_ref_1 = p_ref_1;
3086
4
            while (tensor_blocks[unref_p_ref_1].ref)
3087
0
              unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3088
4
            /* See above comment for the similar p_ref_0 check. */
3089
4
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3090
4
            assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3091
4
            int p_ref_t;
3092
4
            if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3093
3
            {
3094
3
              CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3095
3
              CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3096
3
            }
3097
4
            p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3098
4
            /* If the dimension matches, can fold. */
3099
4
            if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3100
4
            {
3101
4
              const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3102
4
              if (folded)
3103
1
              {
3104
1
                p_ref_0 = p_ref_1;
3105
1
                unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3106
1
                tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3107
1
                for (j = 0; j < unroll_count; 
j++0
) /* Fold its duplicates as well. */
3108
0
                {
3109
0
                  const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3110
0
                  assert(folded && "the subsequent duplicates can be folded too.");
3111
0
                }
3112
1
              }
3113
4
            }
3114
4
          }
3115
90
          /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem
3116
90
           * Or if the p_ref_0 is the output, it first starts from this node (thus, I have full control over
3117
90
           * its life-cycle). Or if the p_ref_0 is the input, it ends in this node (thus, I can take over its
3118
90
           * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3119
90
           * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3120
90
           * associated with it, then we are good. */
3121
90
          if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3122
90
            
(89
p_ref_0_is_in_or_out == 189
&&
_ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)50
) ||
3123
90
            
(39
p_ref_0_is_in_or_out == -139
&&
_ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)39
) ||
3124
90
            
TENSOR_READ_WRITE8
(s_alloc_prep->buffers[i]) == READ_ONLY8
)
3125
86
          {
3126
86
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3127
27
              { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3128
86
            /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3129
86
             * is a long argument for why that is the case; the gist is, it is much easier to control your output
3130
86
             * than your input). */
3131
86
            s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3132
86
            s_alloc_prep->buffers[i].p_refs[1] = 0;
3133
86
            /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3134
86
            assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3135
86
            tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3136
95
            for (j = 0; j < unroll_count; 
j++9
) /* Change the size of its duplicates as well. */
3137
9
              tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3138
9
                tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3139
9
                  tensor_blocks[unref_p_ref_0].size;
3140
86
          } else {
3141
4
            s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3142
4
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3143
0
              ++ro_anonymous_buffer_size;
3144
4
            else
3145
4
              ++rw_anonymous_buffer_size;
3146
4
          }
3147
91
        } else {
3148
91
          if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3149
63
            ++ro_anonymous_buffer_size;
3150
28
          else
3151
28
            ++rw_anonymous_buffer_size;
3152
91
        }
3153
49
      if (ro_anonymous_buffer_size || 
rw_anonymous_buffer_size24
)
3154
28
      {
3155
28
        const int anonymous_block_free_list_cap = anonymous_block_free_list ? 
anonymous_block_free_list->rnum6
:
022
;
3156
28
        // All read-write buffers (potentially) can be reused between each case..of branch.
3157
28
        rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3158
28
        // Read-only buffers cannot be reused between each case..of branch.
3159
28
        ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3160
28
        /* Anonymous block, allocate additional tensor blocks for this. */
3161
28
        /* This is either because this is an internal tensor (it doesn't have a p_ref) */
3162
28
        /* or it is an anonymous block itself within the sub graphs of this while graph. */
3163
28
        tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3164
28
        memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3165
28
        if (dup_tensor_block_ref)
3166
3
          dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3167
175
        for (i = 0; i < s_alloc_prep->buffer_size; 
i++147
)
3168
147
          if (!s_alloc_prep->buffers[i].p_refs[0])
3169
95
          {
3170
95
            if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3171
63
            {
3172
63
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3173
63
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3174
63
              tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3175
63
              tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3176
63
              tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3177
63
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3178
63
              tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3179
63
              ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3180
63
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3181
63
              if (dup_p_refs && 
dup_p_refs->rnum > 00
)
3182
0
              {
3183
0
                for (j = 0; j < dup_p_refs->rnum; j++)
3184
0
                {
3185
0
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3186
0
                  assert(dup_p_ref >= 0);
3187
0
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3188
0
                  assert(tensor_blocks[dup_p_ref].tail);
3189
0
                  // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3190
0
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3191
0
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3192
0
                  {
3193
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3194
0
                    assert(p_node_info);
3195
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3196
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3197
0
                    {
3198
0
                      if (!tensor_blocks[tensor_block_size].dup_p_refs)
3199
0
                        tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3200
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3201
0
                    }
3202
0
                  }
3203
0
                  if (!tensor_blocks[tensor_block_size].tail)
3204
0
                    tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3205
0
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3206
0
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3207
0
                }
3208
63
              } else {
3209
63
                tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3210
63
                ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3211
63
              }
3212
132
              
for (j = 0; 63
j < source_size;
j++69
)
3213
69
                _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3214
63
              /* If this is read-only (based on SSA, i.e., first encountered as a read), and this is
3215
63
               * a sub-graph, mark it to the end of the graph. */
3216
63
              if (p_exec_symbol_info)
3217
12
                
for (j = 0; 6
j < destination_size;
j++6
)
3218
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3219
63
              /* If it is read-only, it is self-reflecting. */
3220
69
              for (k = 0; k < unroll_count; 
k++6
)
3221
6
              {
3222
12
                for (j = 0; j < destination_size; 
j++6
)
3223
6
                  if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3224
6
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3225
6
                /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3226
6
                assert(symbolic_graph->p);
3227
6
                dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3228
6
              }
3229
63
              ++tensor_block_size;
3230
63
            } else {
3231
32
              ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3232
32
              const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3233
32
              const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3234
32
              // Find a suitable tensor block from the free list.
3235
32
              TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3236
32
              TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3237
32
              s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3238
32
              if (new_anonymous_tensor_block)
3239
29
              {
3240
29
                tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3241
29
                tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3242
29
                tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3243
29
                tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3244
29
                ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3245
29
              } else {
3246
3
                tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3247
3
                tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3248
3
              }
3249
32
              if (dup_p_refs && 
dup_p_refs->rnum > 04
)
3250
4
              {
3251
8
                for (j = 0; j < dup_p_refs->rnum; 
j++4
)
3252
4
                {
3253
4
                  const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3254
4
                  assert(dup_p_ref >= 0);
3255
4
                  assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3256
4
                  // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3257
4
                  // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3258
4
                  if (tensor_symbol_info[dup_p_ref].p_ref)
3259
0
                  {
3260
0
                    const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3261
0
                    assert(p_node_info);
3262
0
                    const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3263
0
                    if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3264
0
                    {
3265
0
                      if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3266
0
                        tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3267
0
                      ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3268
0
                    }
3269
0
                  }
3270
4
                  assert(tensor_blocks[dup_p_ref].tail);
3271
4
                  if (!tensor_blocks[tensor_block_idx].tail)
3272
4
                    tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3273
8
                  for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; 
k++4
)
3274
4
                    _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3275
4
                    // We have to add it to the wrap-around companion_ref as well.
3276
4
                    // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3277
4
                    // be occupied and is unlikely to be reused), we cannot really do too much about it because the companion_ref's
3278
4
                    // definition is too free-form, and if we enforce a stronger guarantee on this (such as it must wrap around), this
3279
4
                    // guarantee may break down the line.
3280
4
                    if (tensor_blocks[dup_p_ref].companion_ref)
3281
0
                    {
3282
0
                      const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3283
0
                      for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3284
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3285
0
                      for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3286
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3287
0
                    }
3288
4
                }
3289
28
              } else if (new_anonymous_tensor_block) {
3290
25
                tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3291
25
                ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3292
25
              }
3293
32
              const int prev_tensor_block_idx = tensor_block_idx;
3294
32
              if (new_anonymous_tensor_block)
3295
29
              {
3296
29
                if (!anonymous_block_free_list)
3297
16
                  anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3298
29
                ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3299
29
                ++tensor_block_size;
3300
29
              }
3301
33
              for (k = 0; k < unroll_count; 
k++1
)
3302
1
              {
3303
1
                const int tensor_block_idx = new_anonymous_tensor_block ?
3304
1
                  (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3305
1
                  
dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k]0
;
3306
1
                TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3307
1
                TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3308
1
                if (new_anonymous_tensor_block)
3309
1
                {
3310
1
                  tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3311
1
                  tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3312
1
                  tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3313
1
                  tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3314
1
                  /* Attach to duplicated exec for this tensor block. */
3315
1
                  ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3316
1
                } else {
3317
0
                  tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3318
0
                  tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3319
0
                  _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3320
0
3321
0
                }
3322
1
                if (dup_p_refs && dup_p_refs->rnum > 0)
3323
1
                {
3324
1
                  /* Not nil, not self-reflecting. */
3325
2
                  for (j = 0; j < dup_p_refs->rnum; 
j++1
)
3326
1
                  {
3327
1
                    const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3328
1
                    assert(dup_p_ref >= 0);
3329
1
                    assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3330
1
                    // If it points to a p_ref upwards, check whether this is an output; if it is, add it to
3331
1
                    // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3332
1
                    if (tensor_symbol_info[dup_p_ref].p_ref)
3333
0
                    {
3334
0
                      const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3335
0
                      assert(p_node_info);
3336
0
                      const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3337
0
                      if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3338
0
                      {
3339
0
                        if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3340
0
                          tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3341
0
                        ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3342
0
                      }
3343
0
                    }
3344
1
                    assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3345
1
                    const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3346
1
                    assert(tensor_blocks[dup_dup_p_ref].tail);
3347
1
                    if (!tensor_blocks[tensor_block_idx].tail)
3348
1
                      tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3349
2
                    for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; 
q++1
)
3350
1
                      _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3351
1
                    // We have to add it to the wrap-around companion_ref as well.
3352
1
                    if (tensor_blocks[dup_dup_p_ref].companion_ref)
3353
0
                    {
3354
0
                      const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3355
0
                      for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3356
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3357
0
                      for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3358
0
                        _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3359
0
                    }
3360
1
                  }
3361
1
                } else 
if (0
new_anonymous_tensor_block0
) {
3362
0
                  tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3363
0
                  ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3364
0
                }
3365
1
                if (new_anonymous_tensor_block)
3366
1
                  ++tensor_block_size;
3367
1
              }
3368
32
            }
3369
95
          }
3370
28
      }
3371
49
    }
3372
7.63k
  } ccv_nnc_graph_visit_endfor
3373
2.35k
  
if (1.17k
anonymous_block_free_list1.17k
)
3374
16
    ccv_array_free(anonymous_block_free_list);
3375
1.17k
  ccfree(tensor_fold);
3376
1.17k
  // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3377
1.17k
  // the allocation dependencies, i.e., which tensor reuses which existing tensor.
3378
1.17k
  ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new(exec_dep, tensor_blocks, tensor_block_size);
3379
1.17k
  ccv_matrix_free(exec_dep);
3380
1.17k
  prep->while_count_tensor = 0;
3381
1.17k
  prep->dup_breakpoints = 0;
3382
1.17k
  prep->p = 0;
3383
1.17k
  prep->symbolic_graph = symbolic_graph;
3384
1.17k
  prep->p_idx = symbolic_graph->p_idx;
3385
1.17k
  prep->exec_idx = symbolic_graph->exec_idx;
3386
1.17k
  prep->sub_prep_size = symbolic_graph->sub_graphs ? 
symbolic_graph->sub_graphs->rnum29
:
01.14k
;
3387
1.17k
  prep->sub_preps = sub_preps;
3388
1.17k
  prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3389
1.17k
  prep->exec_symbol_info = exec_symbol_info;
3390
1.17k
  prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3391
1.17k
  prep->tensor_symbol_info = tensor_symbol_info;
3392
1.17k
  prep->unroll_count = unroll_count;
3393
1.17k
  prep->dup_tensor_block_ref = dup_tensor_block_ref;
3394
1.17k
  prep->tensor_block_size = tensor_block_size;
3395
1.17k
  prep->tensor_blocks = tensor_blocks;
3396
1.17k
  prep->exec_flags = exec_flags;
3397
1.17k
  prep->visit = visit;
3398
1.17k
  prep->alloc_prep = alloc_prep;
3399
1.17k
  if (dup_graph)
3400
13
    ccv_nnc_symbolic_graph_free(dup_graph);
3401
1.17k
  if (dup_exec_ref)
3402
1.17k
    ccfree13(dup_exec_ref)13;
3403
1.17k
  return prep;
3404
2.35k
}
3405
3406
static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3407
1.17k
{
3408
1.17k
  int i;
3409
1.17k
  _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3410
1.17k
  ccfree(prep->exec_flags);
3411
1.22k
  for (i = 0; i < prep->sub_prep_size; 
i++50
)
3412
50
    if (prep->sub_preps[i])
3413
49
      _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3414
1.17k
  if (prep->sub_preps)
3415
1.17k
    ccfree29(prep->sub_preps)29;
3416
1.17k
  ccfree(prep->tensor_symbol_info);
3417
1.17k
  ccfree(prep->exec_symbol_info);
3418
1.17k
  if (prep->dup_tensor_block_ref)
3419
1.17k
    ccfree13(prep->dup_tensor_block_ref)13;
3420
1.17k
  _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3421
1.17k
  ccv_nnc_graph_visit_free(prep->visit);
3422
1.17k
  ccfree(prep);
3423
1.17k
}
3424
3425
static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3426
1.17k
{
3427
1.17k
  int i, j;
3428
7.63k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3429
7.63k
    if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3430
21
    {
3431
21
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3432
21
      assert(graph_ref >= 0);
3433
21
      ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3434
43
      for (i = 0; i < node->p_while.input_size; 
i++22
)
3435
22
        if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i]))
3436
22
        {
3437
20
          ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3438
20
          const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i]);
3439
21
          for (j = 0; j < d; 
j++1
)
3440
1
            prep = prep->p;
3441
20
          prep->while_count_tensor = 1;
3442
20
        }
3443
21
    }
3444
7.67k
    
for (i = 0; 7.63k
i < node->graph_ref_size;
i++49
)
3445
49
    {
3446
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3447
49
      if (graph_ref >= 0)
3448
49
        _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3449
49
    }
3450
7.63k
  } ccv_nnc_graph_visit_endfor
3451
1.17k
}
3452
3453
static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3454
22.1k
{
3455
22.1k
  if (symbol >= 0)
3456
16.8k
    return graph_prep->tensor_arena->vt_tensors[symbol];
3457
5.24k
  if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3458
5.22k
    return 0;
3459
20
  assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol));
3460
20
  const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3461
20
  int i;
3462
20
  const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol);
3463
21
  for (i = 0; i < d; 
i++1
)
3464
1
    prep = prep->p;
3465
20
  assert(prep->while_count_tensor);
3466
20
  return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3467
20
}
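Negative symbols other than CCV_NNC_NO_TENSOR_SYMBOL encode a while-count reference together with how many parent levels to climb; the function above walks that many ->p links before reading the tensor out of the arena's metadata. A tiny sketch of just the parent walk (sketch_prep_t is a hypothetical stand-in for the prep struct):

typedef struct sketch_prep_s {
  struct sketch_prep_s* p; // Parent prep; 0 at the root.
} sketch_prep_t;

// Climb d levels towards the root; the caller guarantees the chain is deep enough.
static sketch_prep_t* sketch_walk_up(sketch_prep_t* prep, const int d)
{
  int i;
  for (i = 0; i < d; i++)
    prep = prep->p;
  return prep;
}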
3468
3469
static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3470
1.17k
{
3471
1.17k
  int i;
3472
1.17k
  int* const exec_cvt = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
3473
1.17k
  ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3474
1.17k
  graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3475
1.17k
  graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3476
1.17k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3477
11.9k
  for (i = 0; i < graph_exec_arena->graph_exec_size; 
i++10.7k
)
3478
10.7k
    if (graph_execs[i].graph == graph)
3479
7.62k
      graph_execs[i].d = exec_cvt[graph_execs[i].d];
3480
1.17k
  ccfree(exec_cvt);
3481
1.17k
}
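After ccv_nnc_graph_topsort reorders the concrete graph, every stored exec handle has to be remapped through the old-to-new conversion table, which is what the loop above does for the arena's graph_execs. A minimal generic sketch of that remapping step (hypothetical helper operating on plain int handles):

// Rewrite each handle through the conversion table produced by the sort; handles
// that don't belong to this graph (marked negative here) are left untouched.
static void sketch_remap_handles(int* const handles, const int handle_size, const int* const exec_cvt)
{
  int i;
  for (i = 0; i < handle_size; i++)
    if (handles[i] >= 0)
      handles[i] = exec_cvt[handles[i]];
}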
3482
3483
static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3484
1.17k
{
3485
1.17k
  int i, j, k;
3486
1.17k
  ccv_nnc_graph_t* const graph = graph_prep->graph;
3487
1.17k
  const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3488
1.17k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3489
1.17k
  graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3490
1.17k
  graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3491
1.17k
  graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3492
1.17k
  graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3493
1.17k
  memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3494
1.17k
  ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3495
1.17k
  int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3496
11.9k
  for (i = 0; i < exec_symbol_info_size; 
i++10.7k
)
3497
10.7k
  {
3498
10.7k
    max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size);
3499
10.7k
    max_output_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].output_size);
3500
10.7k
    if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3501
22
      max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size);
3502
10.7k
    graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3503
10.7k
    graph_execs[i].graph = 0;
3504
10.7k
  }
3505
1.22k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
3506
50
    max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size);
3507
1.17k
  ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)];
3508
1.17k
  ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)];
3509
1.17k
  ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)];
3510
1.17k
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3511
1.17k
  const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3512
1.17k
  // Create the nodes; this is done in topological order.
3513
7.62k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3514
7.62k
    if (CCV_NO_GRAPH_EXEC(graph_execs[idx]))
3515
7.62k
    {
3516
29.7k
      for (i = 0; i < node->input_size; 
i++22.0k
)
3517
22.0k
        max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3518
20.5k
      for (i = 0; i < node->output_size; 
i++12.9k
)
3519
12.9k
        max_outputs[i] = node->outputs[i] >= 0 ? 
tensor_arena->vt_tensors[node->outputs[i]]11.7k
:
01.18k
;
3520
7.62k
      if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3521
21
      {
3522
21
        const int graph_ref = CCV_NNC_GRAPH_REF(node)[0] - 1;
3523
21
        assert(graph_ref >= 0);
3524
21
        ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3525
21
        ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3526
21
        graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3527
21
        const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3528
21
        ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3529
21
        ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3530
43
        for (i = 0; i < node->p_while.input_size; 
i++22
)
3531
22
          max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3532
42
        for (i = 0; i < sub_symbolic_graph->breakpoint_size; 
i++21
)
3533
21
          max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3534
21
        ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3535
21
        _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3536
7.60k
      } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3537
24
        for (i = 0; i < node->output_size; 
i++13
)
3538
13
          if (max_outputs[i] && max_outputs[i]->alias_ref)
3539
10
            max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3540
11
        graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3541
11
        // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3542
22
        for (i = 0; i < node->case_of.argument.offset; 
i++11
)
3543
11
        {
3544
11
          ccv_nnc_tensor_t* const update = max_inputs[i];
3545
11
          if (!CCV_IS_TENSOR_MULTIVIEW(update)) // No need if it is a naked tensor.
3546
11
            
continue9
;
3547
2
          int flag = 0;
3548
2
          for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; 
j++0
)
3549
0
            flag = (update == max_inputs[j]);
3550
2
          if (!flag)
3551
2
            ccv_nnc_graph_exec_add_update(graph, graph_execs[idx], update);
3552
2
        }
3553
11
        const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 
11
:
010
;
3554
11
        ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3555
11
        if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3556
1
        {
3557
1
          // Add another graph for data transfer.
3558
1
          ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3559
2
          for (i = 0; i < node->output_size; 
i++1
)
3560
1
            max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 
00
;
3561
1
          ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size), max_outputs, ccv_min(node->input_size, node->output_size));
3562
1
          ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3563
1
          ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3564
1
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3565
1
          int exec_cvt;
3566
1
          ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3567
1
        }
3568
39
        for (i = 0; i < node->graph_ref_size; 
i++28
)
3569
28
        {
3570
28
          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3571
28
          if (graph_ref < 0)
3572
0
            continue;
3573
28
          ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3574
28
          const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref);
3575
28
          ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3576
28
          ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3577
28
          _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3578
28
        }
3579
7.59k
      } else {
3580
7.59k
        graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3581
7.59k
      }
3582
7.62k
      ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3583
7.62k
    }
3584
7.62k
  } ccv_nnc_graph_visit_endfor
3585
1.17k
  // Then connect them.
3586
7.62k
  ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx) {
3587
7.62k
    if (node->outgoings)
3588
13.4k
      
for (i = 0; 6.17k
i < node->outgoings->rnum;
i++7.25k
)
3589
7.25k
      {
3590
7.25k
        const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
3591
7.25k
        if (graph_execs[outgoing].graph)
3592
7.23k
          ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3593
7.25k
      }
3594
7.62k
  } ccv_nnc_graph_visit_endfor
3595
1.17k
  int source_exec_created = 0;
3596
1.17k
  const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3597
1.17k
  const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3598
1.17k
  ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3599
1.17k
  // After the graph is materialized, we need to handle the case where some of these tensors need to be initialized to zero before use.
3600
21.3k
  for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; 
i++20.2k
)
3601
20.2k
  {
3602
20.2k
    if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags))
3603
20.2k
    {
3604
60
      int ref = i;
3605
60
      while (tensor_symbol_info[ref].alias_ref)
3606
0
        ref = tensor_symbol_info[ref].alias_ref - 1;
3607
60
      while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]) && 
tensor_blocks[ref].ref4
)
3608
0
        ref = tensor_blocks[ref].ref - 1;
3609
60
      // This is not computable. It could be that we marked a const tensor as init zero.
3610
60
      if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref]))
3611
60
        
continue4
;
3612
56
      // If this tensor is not used by any exec, we don't need to init at all. Skip.
3613
56
      if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3614
0
        continue;
3615
56
      ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3616
56
      // Now that we have the original tensor, we can get the actual tensor and construct the set command.
3617
56
      ccv_nnc_graph_exec_t set_exec;
3618
56
      if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3619
42
        set_exec = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, CMD_BLAS(0), 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3620
14
      else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3621
14
        set_exec = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, CMD_BLAS(1), 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3622
114
      for (j = 0; j < tensor_blocks[ref].head->rnum; 
j++58
)
3623
58
      {
3624
58
        const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j);
3625
58
        if (outgoing >= exec_symbol_info_size)
3626
0
          continue;
3627
58
        assert(outgoing >= 0);
3628
58
        assert(graph_execs[outgoing].graph);
3629
58
        ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3630
58
      }
3631
56
      int flags = 0;
3632
56
      if (alloc_dep[ref])
3633
35
        
for (j = 0; 17
j < alloc_dep[ref]->rnum;
j++18
)
3634
18
        {
3635
18
          const int d = *(int*)ccv_array_get(alloc_dep[ref], j);
3636
18
          // This is from alloc_dep, it should be computable.
3637
18
          assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]));
3638
18
          if (tensor_blocks[d].tail)
3639
36
            
for (k = 0; 18
k < tensor_blocks[d].tail->rnum;
k++18
)
3640
18
            {
3641
18
              const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k);
3642
18
              if (incoming >= exec_symbol_info_size)
3643
0
                continue;
3644
18
              assert(incoming >= 0);
3645
18
              assert(graph_execs[incoming].graph);
3646
18
              ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3647
18
              flags = 1;
3648
18
            }
3649
18
        }
3650
56
      // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3651
56
      if (!flags)
3652
39
      {
3653
39
        if (!source_exec_created)
3654
19
        {
3655
19
          graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3656
19
          source_exec_created = 1;
3657
19
        }
3658
39
        ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3659
39
      }
3660
56
    }
3661
20.2k
  }
3662
1.17k
  // Now go through the list of tensors to see whether we need to do an explicit broadcast for these tensor multi-views
3663
1.17k
  // (we need that if a multi-view is not associated as an input / output of any exec, which is possible if all execs associate
3664
1.17k
  // with its alias).
3665
1.17k
  assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size);
3666
21.3k
  
for (i = 0; 1.17k
i < tensor_arena->vt_tensor_size;
i++20.2k
)
3667
20.2k
  {
3668
20.2k
    ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3669
20.2k
    // If it is a multi-view tensor, inspect all of its heads to see whether we have already associated it with the node.
3670
20.2k
    if (mv && 
CCV_IS_TENSOR_MULTIVIEW20.0k
(mv))
3671
20.2k
    {
3672
53
      const ccv_array_t* const head = tensor_blocks[i].head;
3673
53
      if (head && 
head->rnum > 047
)
3674
94
        
for (j = 0; 47
j < head->rnum;
j++47
)
3675
47
        {
3676
47
          const int idx = *(int*)ccv_array_get(head, j);
3677
47
          if (idx >= exec_symbol_info_size)
3678
1
            continue;
3679
46
          assert(idx >= 0);
3680
46
          const int d = graph_execs[idx].d;
3681
46
          ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d);
3682
46
          int flag = 0;
3683
46
          if (exec_info->tensor_wraps_ref)
3684
32
          {
3685
32
            ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1);
3686
113
            for (k = 0; k < tensor_wrap_array->size && 
!flag88
;
k++81
)
3687
81
              flag = (tensor_wrap_array->tensor_wraps[k] && 
tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv55
);
3688
32
          }
3689
46
          // If none of them sets the flag, it needs to be included in the broadcast.
3690
46
          if (!flag)
3691
19
            ccv_nnc_graph_exec_add_update(graph, graph_execs[idx], mv);
3692
46
        }
3693
53
    }
3694
20.2k
  }
3695
1.17k
  // Create source / destination phony nodes. This is to facilitate use of the compiled graph.
3696
1.17k
  // Also, this is needed if you have init zero execs.
3697
1.17k
  if (source_exec_created || 
source_size > 11.15k
)
3698
49
  {
3699
49
    if (!source_exec_created)
3700
30
      graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3701
207
    for (i = 0; i < source_size; 
i++158
)
3702
158
      ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3703
1.12k
  } else {
3704
1.12k
    assert(!source_exec_created);
3705
1.12k
    assert(source_size == 1);
3706
1.12k
    graph_exec_arena->source = graph_execs[sources[0].d];
3707
1.12k
  }
3708
1.17k
  if (destination_size == 1)
3709
1.15k
    graph_exec_arena->destination = graph_execs[destinations[0].d];
3710
24
  else {
3711
24
    graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3712
345
    for (i = 0; i < destination_size; 
i++321
)
3713
321
      ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3714
24
  }
3715
1.17k
  ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3716
1.17k
  ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3717
1.17k
  return graph_exec_arena;
3718
1.17k
}
3719
3720
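A hedged sketch of the phony source node built at the end of the function above: when the symbolic graph has more than one source (or init-zero execs were added), a NOOP exec is created and concatenated before every real source so callers always start from exactly one exec. The helper below is hypothetical; the calls it makes are the ones used in this file.

static ccv_nnc_graph_exec_t _example_wrap_sources_with_noop(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const real_sources, const int source_size)
{
  // A single source needs no phony node; return it directly.
  if (source_size == 1)
    return real_sources[0];
  // Otherwise create the phony NOOP exec and concatenate it before every real source.
  const ccv_nnc_graph_exec_t noop = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
  int i;
  for (i = 0; i < source_size; i++)
    ccv_nnc_graph_exec_concat(graph, noop, real_sources[i]);
  return noop;
}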
static ccv_nnc_graph_t* _ccv_nnc_graph_find_peer(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const peer)
3721
11
{
3722
11
  if (graph_prep->symbolic_graph == peer)
3723
4
    return graph_prep->graph;
3724
7
  int i;
3725
10
  for (i = 0; i < graph_prep->sub_prep_size; 
i++3
)
3726
7
    if (graph_prep->sub_preps[i])
3727
7
    {
3728
7
      ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_peer(graph_prep->sub_preps[i], peer);
3729
7
      if (graph)
3730
4
        return graph;
3731
7
    }
3732
7
  
return 03
;
3733
7
}
3734
3735
static void _ccv_nnc_graph_fixup_peer(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3736
1.12k
{
3737
1.12k
  int i;
3738
1.17k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++43
)
3739
43
    if (graph_prep->sub_preps[i])
3740
42
    {
3741
42
      if (graph_prep->sub_preps[i]->symbolic_graph->peer)
3742
4
        graph_prep->sub_preps[i]->graph->peer = _ccv_nnc_graph_find_peer(root_prep, graph_prep->sub_preps[i]->symbolic_graph->peer);
3743
42
    }
3744
1.12k
}
3745
3746
static void _ccv_nnc_graph_exec_arena_fixup_peer_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3747
1.17k
{
3748
1.17k
  assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph);
3749
1.17k
  int i;
3750
11.9k
  for (i = 0; i < graph_prep->exec_symbol_info_size; 
i++10.7k
)
3751
10.7k
  {
3752
10.7k
    if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags))
3753
10.7k
      
continue6
;
3754
10.7k
    if (graph_exec_arena->graph_execs[i].graph && 
graph_prep->exec_symbol_info[i].peer_ref7.62k
)
3755
3.23k
    {
3756
3.23k
      ccv_nnc_graph_exec_t peer_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
3757
3.23k
        .d = graph_prep->exec_symbol_info[i].peer_ref - 1,
3758
3.23k
        .graph = graph_prep->symbolic_graph->peer ? 
graph_prep->symbolic_graph->peer4
:
graph_prep->symbolic_graph3.23k
,
3759
3.23k
      });
3760
3.23k
      if (peer_exec.d >= 0)
3761
213
        ccv_nnc_graph_exec_set_peer(graph_prep->graph, graph_exec_arena->graph_execs[i], peer_exec);
3762
3.23k
    }
3763
10.7k
  }
3764
1.22k
  for (i = 0; i < graph_prep->sub_prep_size; 
i++50
)
3765
50
    if (graph_prep->sub_preps[i])
3766
49
      _ccv_nnc_graph_exec_arena_fixup_peer_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
3767
1.17k
}
3768
3769
static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3770
1.17k
{
3771
1.17k
  int i;
3772
1.17k
  if (graph_prep->dup_breakpoints)
3773
2
  {
3774
2
    // Stripping the const modifier is only possible because it is a sub-graph.
3775
2
    ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
3776
4
    for (i = 0; i < graph_prep->dup_breakpoints->rnum; 
i++2
)
3777
2
      ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i));
3778
2
    ccv_array_free(graph_prep->dup_breakpoints);
3779
2
    graph_prep->dup_breakpoints = 0;
3780
2
    graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3781
2
    // Afterwards, we have to regenerate the exec_symbol_info and fill in the information (through symbol_infer).
3782
2
    memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
3783
2
    // Since exec_symbol_info changed, create a new visit object.
3784
2
    assert(symbolic_graph->sources);
3785
2
    assert(symbolic_graph->destinations);
3786
2
    ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0);
3787
2
    const int source_size = symbolic_graph->sources->rnum;
3788
2
    ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0);
3789
2
    const int destination_size = symbolic_graph->destinations->rnum;
3790
4
    ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new2
(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3791
4
    ccv_nnc_graph_visit_free(graph_prep->visit);
3792
4
    graph_prep->visit = visit;
3793
4
    assert(graph_prep->p);
3794
4
    ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
3795
2
  }
3796
7.62k
  ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx) {
3797
7.67k
    for (i = 0; i < node->graph_ref_size; 
i++49
)
3798
49
    {
3799
49
      const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
3800
49
      if (graph_ref >= 0)
3801
49
        _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
3802
49
    }
3803
7.62k
  } ccv_nnc_graph_visit_endfor
3804
1.17k
}
3805
3806
void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
3807
1.12k
{
3808
1.12k
  assert(graph_ref);
3809
1.12k
  assert(tensor_arena_ref);
3810
1.12k
  assert(graph_exec_arena_ref);
3811
1.12k
  int i;
3812
1.12k
  // Cannot bind a multi-view tensor.
3813
11.0k
  for (i = 0; i < tensor_bind_size; 
i++9.89k
)
3814
9.89k
  {
3815
9.89k
    assert(tensor_binds[i].tensor);
3816
9.89k
    assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor));
3817
9.89k
  }
3818
1.12k
  ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
3819
1.12k
  _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
3820
1.12k
  ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, 0, tensor_binds, tensor_bind_size);
3821
1.12k
  _ccv_nnc_tensor_arena_fixup_peer_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
3822
1.12k
  *tensor_arena_ref = tensor_arena;
3823
1.12k
  // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
3824
1.12k
  _ccv_nnc_graph_fixup_peer(graph_prep, graph_prep);
3825
1.12k
  // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
3826
1.12k
  _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
3827
1.12k
  *graph_ref = graph_prep->graph;
3828
1.12k
  ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
3829
1.12k
  _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
3830
1.12k
  _ccv_nnc_graph_exec_arena_fixup_peer_ref(graph_exec_arena, graph_prep, graph_exec_arena);
3831
1.12k
  *graph_exec_arena_ref = graph_exec_arena;
3832
1.12k
  _ccv_nnc_symbolic_graph_prep_free(graph_prep);
3833
1.12k
}
3834
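A hedged usage sketch of the entry point above. ccv_nnc_symbolic_graph_new is assumed from the wider ccv_nnc API and is not defined in this file; the symbol-creation step is elided. The source / destination accessors are the same ones used elsewhere in this file.

ccv_nnc_symbolic_graph_t* const symbolic_graph = ccv_nnc_symbolic_graph_new();
// ... create tensor symbols and graph exec symbols on symbolic_graph (elided) ...
ccv_nnc_graph_t* graph = 0;
ccv_nnc_tensor_arena_t* tensor_arena = 0;
ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
ccv_nnc_symbolic_graph_compile(symbolic_graph,
  0, 0, // no tensor binds
  0, 0, // no explicit outputs
  ccv_nnc_symbolic_graph_sources(symbolic_graph), ccv_nnc_symbolic_graph_source_size(symbolic_graph),
  ccv_nnc_symbolic_graph_destinations(symbolic_graph), ccv_nnc_symbolic_graph_destination_size(symbolic_graph),
  &graph, &tensor_arena, &graph_exec_arena);
// graph can now be run; tensor_arena owns the backing memory and graph_exec_arena
// maps exec symbols back to concrete graph execs.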
3835
static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
3836
1.17k
{
3837
1.17k
  // Buffers are inherited from above; no need to deallocate them here.
3838
1.17k
  int i;
3839
1.22k
  for (i = 0; i < tensor_arena->sub_arena_size; 
i++50
)
3840
50
    if (tensor_arena->sub_arenas[i])
3841
49
      _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
3842
1.23k
  for (i = 0; i < tensor_arena->m_tensor_idx->rnum; 
i++61
)
3843
61
  {
3844
61
    ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i));
3845
61
    assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv));
3846
61
    ccv_nnc_tensor_multiview_free(*mv);
3847
61
  }
3848
1.17k
  ccv_array_free(tensor_arena->tensor_metadata);
3849
1.17k
  ccv_array_free(tensor_arena->m_tensor_idx);
3850
1.17k
  ccfree(tensor_arena);
3851
1.17k
}
3852
3853
void ccv_nnc_tensor_bind_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
3854
59.8k
{
3855
59.8k
  assert(tensor_arena->graph_ref == (intptr_t)symbol.graph);
3856
59.8k
  assert(symbol.d < tensor_arena->vt_tensor_size);
3857
59.8k
  tensor_arena->vt_tensors[symbol.d]->data.ptr = tensor->data.ptr;
3858
59.8k
}
3859
3860
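A hedged sketch of rebinding with the function above: point an already-compiled tensor at externally managed memory between runs. ccv_nnc_tensor_new and the names x / x_params below are assumptions from the wider ccv_nnc API, not defined in this file.

// x is a ccv_nnc_tensor_symbol_t created earlier on the same symbolic graph and
// x_params is its ccv_nnc_tensor_param_t; both are hypothetical names here.
ccv_nnc_tensor_t* const replacement = ccv_nnc_tensor_new(0, x_params, 0);
ccv_nnc_tensor_bind_symbol(tensor_arena, x, replacement);
// Only data.ptr is copied by the bind, so replacement must match the compiled
// tensor's layout and must stay alive for as long as the graph is run.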
uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
3861
1
{
3862
1
  uint64_t total_size = 0;
3863
1
  int i;
3864
19
  for (i = 0; i < tensor_arena->buffer_size; 
i++18
)
3865
18
    total_size += tensor_arena->buffers[i].size;
3866
1
  return total_size;
3867
1
}
3868
3869
void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
3870
1.12k
{
3871
1.12k
  int i;
3872
5.20k
  for (i = 0; i < tensor_arena->buffer_size; 
i++4.08k
)
3873
4.08k
  {
3874
4.08k
    const int buffer_type = tensor_arena->buffers[i].type;
3875
4.08k
    const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
3876
4.08k
#ifdef HAVE_CUDA
3877
4.08k
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
3878
4.08k
    if (memory_type == CCV_TENSOR_GPU_MEMORY)
3879
544
      cufree(device_id, tensor_arena->buffers[i].ptr);
3880
3.53k
    else {
3881
3.53k
      assert(memory_type == CCV_TENSOR_CPU_MEMORY);
3882
3.53k
      if (tensor_arena->buffers[i].pin_mem)
3883
10
        cuhostfree(tensor_arena->buffers[i].ptr);
3884
3.53k
      else
3885
3.53k
        
ccfree3.52k
(tensor_arena->buffers[i].ptr)3.52k
;
3886
3.53k
    }
3887
#else
3888
    assert(memory_type == CCV_TENSOR_CPU_MEMORY);
3889
    ccfree(tensor_arena->buffers[i].ptr);
3890
#endif
3891
  }
3892
1.12k
  _ccv_nnc_tensor_arena_free(tensor_arena);
3893
1.12k
}
3894
3895
void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3896
1.17k
{
3897
1.17k
  int i;
3898
1.22k
  for (i = 0; i < graph_exec_arena->sub_arena_size; 
i++50
)
3899
50
    if (graph_exec_arena->sub_arenas[i])
3900
49
      ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
3901
1.17k
  ccfree(graph_exec_arena);
3902
1.17k
}
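A hedged teardown sketch: the three objects produced by ccv_nnc_symbolic_graph_compile are released separately. ccv_nnc_graph_free is assumed from the wider ccv_nnc API; the two arena frees are the functions defined above.

ccv_nnc_graph_free(graph); // the concrete graph itself
ccv_nnc_tensor_arena_free(tensor_arena); // buffers, multi-views and tensor metadata
ccv_nnc_graph_exec_arena_free(graph_exec_arena); // exec handles, recursively over sub-arenas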