Bug Summary

File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 3779, column 7
The left operand of '==' is a garbage value
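For readers unfamiliar with this diagnostic: the analyzer emits it when the value feeding the left side of an '==' comparison was never written on the path it explored. The following standalone sketch only illustrates that warning class (the find_hop helper and every name in it are made up, and it is not the code at line 3779):

#include <stdio.h>

/* Hypothetical helper: only writes *hop when an ordering exists. */
static int find_hop(int from, int to, int* hop)
{
	if (from < to)
	{
		*hop = to - from;
		return 1;
	}
	return 0; /* *hop is left untouched on this path. */
}

int main(void)
{
	int hop; /* Never initialized. */
	find_hop(3, 1, &hop); /* Returns 0, so hop stays indeterminate. */
	/* On this path the analyzer reports "The left operand of '==' is a
	 * garbage value" for the comparison below. Initializing hop, or
	 * checking find_hop's return value first, silences the report. */
	if (hop == 0)
		printf("no hop\n");
	return 0;
}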

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-02-25-101053-1436736-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12
13// MARK - Level-3 API
14
15typedef struct {
16 int flags;
17 int type;
18 int pin_mem; // This memory needs to be pinned.
19 int ref; // Reference to another tensor block. Start with 1.
20 int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21 int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22 int companion_ref; // Reference to another block with which this one shares the same memory region. Start with 1. The current crude implementation requires the two to mutually be companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. For the allocation algorithm, we use the primary throughout.
23 int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24 ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25 uint64_t size; // The size of the tensor expected.
26 int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28 ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29 ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
32#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
34enum {
35 UNASSIGNED = 0x1,
36 ALIAS = 0x2,
37 READ_ONLY = 0x4,
38 WRITE_ONLY = 0x8,
39 READ_WRITE = 0xc,
40 ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not a reference to any specific tensor).
41 UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42 UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43};
44
45#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
46#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
47#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
48#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
49#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
50#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
51#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
52#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
53#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
54#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
56#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
58#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
60
61#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62
63// Holds additional information about the exe nodes.
64typedef struct {
65 int flags;
66} ccv_nnc_graph_exec_flag_t;
67
68enum {
69 CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
70};
71
72typedef struct {
73 int index;
74 int oc;
75 int type;
76 uint64_t size;
77} ccv_nnc_tensor_opt_t;
78
79// We first sort the same type together (because they won't be reused at all).
80// And then we sort by size; after that, sort by oc.
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
83#undef more_than
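The sorting criterion above is easier to see outside of the CCV_IMPLEMENT_QSORT invocation. As a rough standalone illustration (using the C library qsort rather than ccv's macro; tensor_opt_t and the sample values below are made up for the example), the intended order is descending by size, with ties broken by descending oc:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct {
	int index;
	int oc;
	uint64_t size;
} tensor_opt_t; /* Mirrors the fields of ccv_nnc_tensor_opt_t used by the sort. */

/* Same criterion as more_than above, phrased as a qsort comparator:
 * larger size first; on equal size, larger oc first. */
static int opt_cmp(const void* a_, const void* b_)
{
	const tensor_opt_t* a = a_;
	const tensor_opt_t* b = b_;
	if (a->size != b->size)
		return (a->size < b->size) ? 1 : -1;
	return (a->oc < b->oc) ? 1 : (a->oc > b->oc) ? -1 : 0;
}

int main(void)
{
	tensor_opt_t opt[] = {
		{ .index = 0, .oc = 2, .size = 64 },
		{ .index = 1, .oc = 5, .size = 256 },
		{ .index = 2, .oc = 9, .size = 64 },
	};
	qsort(opt, 3, sizeof(opt[0]), opt_cmp);
	/* Prints 1, 2, 0: the 256-byte block first, then the two 64-byte
	 * blocks ordered by descending oc. */
	for (int i = 0; i < 3; i++)
		printf("%d\n", opt[i].index);
	return 0;
}

Note that the type grouping mentioned in the comment above is handled later by the candidate-selection loop (which only collects blocks of current_type), not by this comparator.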
84typedef struct {
85 int idx;
86 int hop;
87} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
90#undef less_than
91
92// If b has items that overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95 assert(a);
96 assert(b);
97 int x, y;
98 for (x = 0; x < b->rnum; x++)
99 {
100 const int p = *(int*)ccv_array_get(b, x);
101 int flag = 0;
102 // In extreme cases where a is a superset of b, then a is still after b, we are good.
103 for (y = 0; !flag && y < a->rnum; y++)
104 {
105 const int q = *(int*)ccv_array_get(a, y);
106 flag = (p == q);
107 }
108 if (!flag)
109 for (y = 0; y < a->rnum; y++)
110 {
111 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
112 if (!cell.i32 || cell.i32[0] == 0)
113 return 0;
114 }
115 }
116 // If b->rnum == 0, a is after b for sure.
117 // Otherwise, if a->rnum == 0, we don't check any, but if b->rnum > 0, then we cannot say a is after b.
118 // If both a->rnum > 0 and b->rnum > 0, the above logic should have checked them all.
119 return (a->rnum > 0 || b->rnum == 0);
120}
121
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124 assert(a);
125 assert(b);
126 if (!a->rnum || !b->rnum)
127 return 0;
128 int x, y, max_hop = 0;
129 for (x = 0; x < a->rnum; x++)
130 {
131 ccv_sparse_matrix_vector_t* const vector = ccv_get_sparse_matrix_vector(exec_dep, *(int*)ccv_array_get(a, x));
132 if (!vector)
133 return 0;
134 for (y = 0; y < b->rnum; y++)
135 {
136 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, *(int*)ccv_array_get(b, y));
137 if (!cell.i32 || cell.i32[0] == 0)
138 return 0;
139 if (cell.i32[0] > max_hop)
140 max_hop = cell.i32[0];
141 }
142 }
143 // We've entered this nested for loop; therefore, a is verifiably, deterministically after b now.
144 // The max hop also denotes, in that case, how many hops (at most) we need to get from a to b.
145 return max_hop;
146}
147
148// If every a's head is deterministically after b's tail
149static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
150{
151 return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
152}
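As a hedged, standalone sketch of what the two helpers above compute, the same "every head of a is deterministically after every tail of b" test can be written against a tiny dense hop table (plain arrays stand in for exec_dep, head and tail; none of the names or values below come from the analyzed file):

#include <stdio.h>

#define N 4

/* hop[i][j] > 0 means exec node i is deterministically after node j,
 * reachable in hop[i][j] steps; 0 means no known ordering. */
static const int hop[N][N] = {
	{ 0, 0, 0, 0 },
	{ 1, 0, 0, 0 },
	{ 2, 1, 0, 0 },
	{ 3, 2, 1, 0 },
};

/* Exclusive variant: every node in a must be strictly after every node
 * in b; returns the maximum hop count, or 0 if that is not the case. */
static int a_after_b_exclusively(const int* a, int a_num, const int* b, int b_num)
{
	int max_hop = 0;
	if (!a_num || !b_num)
		return 0;
	for (int x = 0; x < a_num; x++)
		for (int y = 0; y < b_num; y++)
		{
			const int h = hop[a[x]][b[y]];
			if (h == 0)
				return 0;
			if (h > max_hop)
				max_hop = h;
		}
	return max_hop;
}

int main(void)
{
	const int head_of_a[] = { 2, 3 }; /* Where block a first becomes alive. */
	const int tail_of_b[] = { 0, 1 }; /* Where block b dies. */
	/* Prints 3: all of a's heads are after all of b's tails, so the two
	 * blocks never overlap in time and could share memory; the returned
	 * hop bounds how far apart they are. */
	printf("%d\n", a_after_b_exclusively(head_of_a, 2, tail_of_b, 2));
	return 0;
}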
153
154typedef struct {
155 ccv_array_t** alloc_dep;
156 int vt_block_size;
157 int buffer_size;
158 int block_size;
159 int* vt_blocks; // A reference to the block, because blocks only contains available blocks (thus, doesn't consider aliases etc.). -1 means no block pointed to. Starts at 0.
160 struct {
161 int type; // The type from tensor blocks.
162 int pin_mem; // Whether this is pinned memory.
163 int flags; // The flags (currently for READ_ONLY or not).
164 uint64_t size; // The size of the buffer allocated.
165 int p_refs[2]; // Reference to the upper level block. Starts at 1. Only index 0 is valid throughout; I do use two in the code as a temporary placeholder.
166 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that point to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
167 }* buffers;
168 struct {
169 int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
170 int block_ref; // A reference to which block in the given tensor_block to use.
171 uint64_t offset; // The offset of this block.
172 }* blocks;
173} ccv_nnc_tensor_alloc_prep_t;
174
175typedef struct ccv_nnc_symbolic_graph_prep_s {
176 int flags;
177 int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
178 int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
179 int exec_idx;
180 int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
181 int tensor_symbol_info_size;
182 int exec_symbol_info_size;
183 int tensor_block_size;
184 int sub_prep_size;
185 ccv_nnc_tensor_block_t* tensor_blocks;
186 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
187 ccv_nnc_graph_exec_flag_t* exec_flags;
188 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
189 int* dup_tensor_block_ref;
190 ccv_nnc_graph_visit_t* visit;
191 ccv_nnc_tensor_alloc_prep_t* alloc_prep;
192 struct ccv_nnc_symbolic_graph_prep_s* p;
193 struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
194 // Structures that don't require to be freed after deallocation.
195 const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
196 ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
197 ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
198 ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
199} ccv_nnc_symbolic_graph_prep_t;
200
201typedef struct {
202 int oc;
203 ccv_array_t* itf;
204} ccv_nnc_tensor_block_adjacent_t;
205
206static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
207{
208 // Compute how many dis-continuous buffers are needed.
209 // We prefer to have several dis-continuous buffers instead of one big buffer because
210 // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
211 // to fully utilize memory.
212 int i, j, k;
213 ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
214 int allocable_tensor_size = 0, available_tensor_size = 0;
215 for (i = 0; i < tensor_block_size; i++)
216 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
217 {
218 // Tensors that we need the header info.
219 ++available_tensor_size;
220 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
221 // Tensors that we actually need to allocate (exclude the alias).
222 ++allocable_tensor_size;
223 }
224 ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
225 ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
226 ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
227 // Overlap count.
228 for (i = 0; i < tensor_block_size; i++)
229 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
230 for (j = i + 1; j < tensor_block_size; j++)
231 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
232 {
233 // We only reuse buffers within the same memory type. The tensor_dt / tensor_df
234 // matrices are only queried later for same-type candidates in this function,
235 // thus cross-type hop relations are not needed for allocation planning here.
236 if (tensor_blocks[i].type != tensor_blocks[j].type)
237 continue;
238 // Check to see if they interfere (default to yes).
239 // If any of the i's head is deterministically later than j's tail
240 // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
241 const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
242 int j_hop_i = 0;
243 if (i_hop_j > 0)
244 {
245 ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
246 ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
247 } else {
248 // It cannot be that both directions are positive. If i can hop to j, we don't
249 // need the reverse hop value for any subsequent allocation decision.
250 j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
251 if (j_hop_i > 0)
252 {
253 ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
254 ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
255 }
256 }
257 if (!i_hop_j && !j_hop_i)
258 {
259 if (!adj[i].itf)
260 adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
261 ccv_array_push(adj[i].itf, &j);
262 ++adj[i].oc;
263 if (!adj[j].itf)
264 adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
265 ccv_array_push(adj[j].itf, &i);
266 ++adj[j].oc;
267 }
268 }
269 const int exec_dep_rows = exec_dep->rows;
270 ccv_matrix_free(exec_dep);
271 ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
272 int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
273 uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
274 uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
275 uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
276 int num_assigned = 0;
277 // I can do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
278 // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
279 // The first channel denotes the bytes available for allocation,
280 // the second channel denotes the offset available for the allocation.
281 ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
282 ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
283 for (j = 0; j < allocable_tensor_size;)
284 {
285 // Find the one with the largest overlap (in case the overlap is the same, the larger size) that is not yet assigned.
286 uint64_t max_size = 0;
287 ccv_array_clear(opt);
288 int current_type = 0; // Deal with one type at a time.
289 for (i = 0; i < tensor_block_size; i++)
290 if (tensor_blocks[i].size >= max_size &&
291 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
292 IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
293 (!current_type || tensor_blocks[i].type == current_type))
294 {
295 ccv_nnc_tensor_opt_t a = {
296 .size = tensor_blocks[i].size,
297 .index = i,
298 .oc = adj[i].oc,
299 .type = tensor_blocks[i].type,
300 };
301 assert(a.type);
302 current_type = a.type; // Now we know the primary type we should deal with.
303 if (tensor_blocks[i].companion_ref)
304 {
305 const int companion_ref = tensor_blocks[i].companion_ref - 1;
306 a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
307 a.oc += adj[companion_ref].oc;
308 }
309 // In case we have a tie, take them all in the array.
310 if (a.size > max_size)
311 ccv_array_clear(opt), max_size = a.size;
312 ccv_array_push(opt, &a);
313 }
314 assert(opt->rnum > 0);
315 // Order opt array by the oc because type and size should be equal at this point.
316 _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
317 // Go through the opt array again; this time it is ordered by size, so if we find a place to insert, we are good.
318 int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
319 uint64_t min_val[2] = {
320 0, 0
321 };
322 if (j > 0)
323 {
324 for (i = 0; i < opt->rnum; i++)
325 {
326 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
327 if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
328 continue;
329 // Now, determine the order between a and c. After this, we can always check whether y
330 // can hop to the earliest one and if the latest one can hop to x.
331 // The earliest one will be called p and the latest one will be called q.
332 int p = a.index;
333 int q = a.index;
334 if (tensor_blocks[a.index].companion_ref)
335 {
336 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
337 if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
338 continue;
339 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
340 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
341 p = companion_ref;
342 else {
343 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
344 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
345 q = companion_ref;
346 else { // Otherwise, b is in between p and q.
347 const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
348 const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
349 assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
350 }
351 }
352 }
353 assert(tensor_blocks[q].type == tensor_blocks[p].type);
354 const int type = tensor_blocks[p].type;
355 // y is always earlier than x, but this is hard to assert now.
356 // If this edge satisfies the requirement, we now need to find the ones with the tightest possible bounds.
357 // Thus, the hop between y and x (through a) should be the smallest one.
358 // We optimize this by first finding all allocated nodes that come into p, and all allocated nodes that go
359 // out of q. For these nodes, we try to verify whether they form a connection (by checking against the
360 // alloc sparse matrix). If they do, try to see whether we can insert with the tightest bound.
361 int y_size = 0;
362 ccv_nnc_tensor_hop_t* const y_buf = buf;
363#define for_block(y, val) do { \
364 if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
365 y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
366 .idx = y + 1, .hop = ((int*)val)[0] \
367 }; \
368 } while(0)
369 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
370 if (y_vector)
371 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
372#undef for_block
373 assert(y_size <= tensor_block_size);
374 int x_size = 0;
375 ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
376#define for_block(x, val) do { \
377 if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
378 x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
379 .idx = x + 1, .hop = ((int*)val)[0] \
380 }; \
381 } while(0)
382 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
383 if (x_vector)
384 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
385#undef for_block
386 assert(y_size + x_size <= tensor_block_size);
387 int x, y;
388 if (y_size > 1)
389 _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
390 for (y = 0; y < y_size; y++)
391 {
392 const int hop = exec_dep_rows + y_buf[y].hop;
393 if (hop >= min_hop)
394 break;
395 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
396 if (val.u64 && val.u64[0] >= a.size)
397 {
398 min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
399 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
400 break;
401 }
402 }
403 if (x_size > 1)
404 _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
405 for (x = 0; x < x_size; x++)
406 {
407 const int hop = exec_dep_rows + x_buf[x].hop;
408 if (hop >= min_hop)
409 break;
410 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
411 if (val.u64 && val.u64[0] >= a.size)
412 {
413 min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
414 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
415 break;
416 }
417 }
418 if (x_size > 0)
419 {
420 const int x_min_hop = x_buf[0].hop;
421 for (y = 0; y < y_size; y++)
422 {
423 const int y_hop_p_v = y_buf[y].hop;
424 if (y_hop_p_v + x_min_hop >= min_hop)
425 break;
426 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
427 if (y_vector)
428 {
429 for (x = 0; x < x_size; x++)
430 {
431 const int q_hop_x_v = x_buf[x].hop;
432 const int hop = y_hop_p_v + q_hop_x_v;
433 if (hop >= min_hop)
434 break;
435 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
436 if (val.u64 && val.u64[0] >= a.size)
437 {
438 min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
439 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
440 break;
441 }
442 }
443 }
444 }
445 }
446 // If I found a place, stop, and exit.
447 if (min_y > 0 || min_x < tensor_block_size + 1)
448 {
449 min_i = i;
450 break;
451 }
452 // There is no space to insert this block, mark it as such.
453 tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
454 if (tensor_blocks[a.index].companion_ref)
455 {
456 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
457 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
458 }
459 }
460 }
461 // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group),
462 // and default to the largest size available.
463 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
464 if (min_i == -1)
465 {
466 allocated_size[num_assigned] = a.size;
467 ++num_assigned;
468 }
469 int assign_group = num_assigned;
470 if (min_y > 0)
471 {
472 assign_group = assigned[min_y - 1];
473 // The y and x should belong to the same assigned group.
474 assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
475 } else if (min_x < tensor_block_size + 1)
476 assign_group = assigned[min_x - 1];
477 // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
478 if (min_y != 0 || min_x != tensor_block_size + 1)
479 {
480 uint64_t val[2] = {
481 min_val[0], min_val[1]
482 };
483 assert(val[0] >= a.size);
484 val[0] -= a.size;
485 val[1] = val[1] + a.size; // Move the offset to the next one.
486 ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
487 }
488 int strings[3];
489 strings[0] = a.index + 1;
490 int string_size = 1;
491 // Assign out the designated companion if it exists.
492 if (tensor_blocks[a.index].companion_ref)
493 {
494 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495 assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
496 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498 {
499 for (i = 0; i < string_size; i++)
500 strings[i + 1] = strings[i];
501 strings[0] = companion_ref + 1;
502 } else {
503 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505 strings[string_size] = companion_ref + 1;
506 else {
507 // Because b_hop_p is 0, q_hop_b is nil, and p != q, b must be in between p and q. Therefore, I must have 2 allocations.
508 assert(string_size == 2);
509 strings[2] = strings[1];
510 strings[1] = companion_ref + 1;
511 }
512 }
513 ++string_size;
514 }
515 // Assign out and update oc.
516 for (i = 0; i < string_size; i++)
517 {
518 const int index = strings[i] - 1;
519 // Assign out the selected one.
520 assigned[index] = assign_group;
521 // The offset for this one should be either 0 (started a new group, when min_i == -1) or the offset on this edge.
522 allocated_offset[index] = min_val[1];
523 if (adj[index].itf)
524 for (k = 0; k < adj[index].itf->rnum; k++)
525 {
526 const int d = *(int*)ccv_array_get(adj[index].itf, k);
527 if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
528 --adj[d].oc;
529 }
530 }
531 uint64_t val[2] = {
532 a.size, min_val[1]
533 };
534 uint64_t consumed_size = 0;
535 // Go over from min_y to string_size (excluding min_x).
536 for (i = 0; i < string_size; i++)
537 {
538 const uint64_t size = tensor_blocks[strings[i] - 1].size;
539 assert(size <= a.size);
540 // Update consumed size if it is bigger than "size".
541 if (size > consumed_size)
542 {
543 val[0] = size - consumed_size;
544 ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545 consumed_size = size;
546 val[1] = min_val[1] + consumed_size;
547 }
548 // If it consumed all the flow, break out.
549 if (consumed_size == a.size)
550 break;
551 }
552 for (i = 0; i < string_size; i++)
553 {
554 const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555 uint64_t val[2] = {
556 i_size, min_val[1]
557 };
558 uint64_t consumed_size = 0;
559 for (k = i + 1; k < string_size; k++)
560 {
561 const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
562 // Update consumed size if it is bigger than "size".
563 if (size > consumed_size)
564 {
565 val[0] = size - consumed_size;
566 ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567 consumed_size = size;
568 val[1] = min_val[1] + consumed_size;
569 }
570 // If it consumed all the flow, break out.
571 if (consumed_size == i_size)
572 break;
573 }
574 val[0] = i_size - consumed_size;
575 // Still have residual, flow it to min_x.
576 if (val[0] > 0)
577 ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578 }
579 if (min_i == -1)
580 {
581 // If we decide to insert a new edge, simply mark anyone that does not interfere with it to be redone.
582 const int p = strings[0] - 1;
583 const int q = strings[string_size - 1] - 1;
584 const int type = tensor_blocks[p].type;
585#define for_block(y, val) do { \
586 if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
587 { \
588 tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
589 if (tensor_blocks[y].companion_ref) \
590 { \
591 const int companion_ref = tensor_blocks[y].companion_ref - 1; \
592 tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
593 } \
594 } \
595 } while(0)
596 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
597 if (y_vector)
598 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
599#undef for_block
600#define for_block(x, val) do { \
601 if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
602 { \
603 tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
604 if (tensor_blocks[x].companion_ref) \
605 { \
606 const int companion_ref = tensor_blocks[x].companion_ref - 1; \
607 tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
608 } \
609 } \
610 } while(0)
611 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
612 if (x_vector)
613 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
614#undef for_block
615 }
616 j += string_size;
617 }
618 ccfreefree(tensor_block_cannot_insert);
619 ccfreefree(buf);
620 ccv_array_free(opt);
621 ccv_matrix_free(tensor_df);
622 ccv_matrix_free(tensor_dt);
623#define for_block(y, x, val) do { \
624 if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
625 { \
626 if (!alloc_dep[x - 1]) \
627 alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
628 ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
629 } \
630 } while (0)
631 CCV_SPARSE_FOREACH(alloc, for_block); /* macro expansion omitted */
632#undef for_block
633 ccv_matrix_free(alloc);
634 for (i = 0; i < tensor_block_size; i++)
635 if (adj[i].itf)
636 ccv_array_free(adj[i].itf);
637 ccfreefree(adj);
638 ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
639 alloc_prep->alloc_dep = alloc_dep;
640 alloc_prep->vt_block_size = tensor_block_size;
641 alloc_prep->buffer_size = num_assigned;
642 alloc_prep->block_size = available_tensor_size;
643 alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
644 alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
645 alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
646 memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
647 for (i = 0; i < num_assigned; i++)
648 alloc_prep->buffers[i].size = allocated_size[i];
649 if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
650 {
651 size_t total_size = 0;
652 for (i = 0; i < num_assigned; i++)
653 total_size += allocated_size[i];
654 PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf("Total buffer size of %zu to be allocated\n", total_size); fflush(stdout); } } while (0);
655 }
656 ccfreefree(allocated_size);
657 j = 0;
658 // Assigning out the tensors (in case of sharing tensors / in-place ops).
659 for (i = 0; i < tensor_block_size; i++)
660 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
661 {
662 alloc_prep->blocks[j].block_ref = i;
663 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
664 {
665 alloc_prep->vt_blocks[i] = j;
666 // Also, set its allocations.
667 assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 667, __extension__ __PRETTY_FUNCTION__
); }))
;
668 const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
669 alloc_prep->blocks[j].offset = allocated_offset[i];
670 if (!alloc_prep->buffers[buffer_ref].type)
671 alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
672 alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
673 alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
674 assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 674, __extension__ __PRETTY_FUNCTION__
); }))
;
675 } else {
676 alloc_prep->vt_blocks[i] = -1;
677 alloc_prep->blocks[j].buffer_ref = -1;
678 alloc_prep->blocks[j].offset = 0;
679 }
680 ++j;
681 } else
682 alloc_prep->vt_blocks[i] = -1;
683 ccfreefree(allocated_offset);
684 ccfreefree(assigned);
685 return alloc_prep;
686}
687
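/* Illustrative sketch, not part of ccv_nnc_symbolic_graph_compile.c: the for_block macro
 * defined at lines 623-630 above walks the alloc sparse matrix and, for every non-zero
 * cell (y, x) within bounds, records block y - 1 into alloc_dep[x - 1] (the indices in
 * the matrix are offset by one). A simplified standalone version of that bookkeeping,
 * assuming a fixed-capacity dependency list in place of ccv_array_t, might look like this: */
typedef struct { int deps[8]; int count; } sketch_alloc_dep_t; /* hypothetical; capacity chosen arbitrarily */
static void sketch_record_alloc_dep(sketch_alloc_dep_t* const alloc_dep, const int y, const int x, const int tensor_block_size)
{
	/* Mirrors the y > 0 / x bounds guard of the macro; assume the caller only visits non-zero cells. */
	if (y > 0 && x < tensor_block_size + 1)
	{
		sketch_alloc_dep_t* const dep = alloc_dep + (x - 1);
		int i, found = 0;
		for (i = 0; i < dep->count; i++) /* ccv_array_add_unique_int dedupes; do the same here */
			if (dep->deps[i] == y - 1)
				found = 1;
		if (!found && dep->count < 8)
			dep->deps[dep->count++] = y - 1;
	}
}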
688static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
689{
690 int i;
691 for (i = 0; i < alloc_prep->vt_block_size; i++)
692 if (alloc_prep->alloc_dep[i])
693 ccv_array_free(alloc_prep->alloc_dep[i]);
694 for (i = 0; i < alloc_prep->buffer_size; i++)
695 if (alloc_prep->buffers[i].dup_p_refs)
696 ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
697 ccfreefree(alloc_prep->alloc_dep);
698 ccfreefree(alloc_prep);
699}
700
701// Simple allocator from ccv_array_t.
702static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
703{
704 int pos = tensor_metadata->rnum;
705 int rsize = (size + 15) / 16;
706 ccv_array_resize(tensor_metadata, pos + rsize);
707 return (pos << 1) + 1;
708}
709
710static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
711{
712 assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 712, __extension__ __PRETTY_FUNCTION__
); }))
;
713 return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)))
;
714}
715
716#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
717
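/* Illustrative sketch, not part of the original source: _ccv_nnc_tensor_metadata_pos_new
 * above hands out offsets into a growable array in 16-byte slots and encodes them as
 * (index << 1) | 1, so the low bit tells an encoded position apart from a real
 * ccv_nnc_tensor_t* (heap pointers are aligned, hence their low bit is 0). A minimal
 * standalone model of that encoding, using plain ints in place of ccv_array_t: */
#include <assert.h>
#include <stdint.h>
static int sketch_pos_encode(const int slot_index)
{
	return (slot_index << 1) | 1; /* low bit set marks "this is a position, not a pointer" */
}
static int sketch_pos_is_metadata(const void* const ptr_or_pos)
{
	return ((uintptr_t)ptr_or_pos & 1); /* mirrors CCV_NNC_IS_METADATA_POS */
}
static int sketch_pos_decode(const int pos)
{
	assert(pos & 1);
	return pos >> 1; /* back to the slot index consumed by _ccv_nnc_tensor_metadata_get */
}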
718static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
719{
720 // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
721 if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
722 return vt_tensor;
723 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
724 if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
725 {
726 const int alias_ref = tensor->alias_ref;
727 tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
728 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
729 }
730 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
731 {
732 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
733 int i;
734 const int count = mv->kind + mv->repeat;
735 for (i = 0; i < count; i++)
736 {
737 if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1)
)
738 {
739 const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i];
740 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
741 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
742 }
743 }
744 // No need to recursively follow the parent pointer; otherwise we would be doing a deep rewire.
745 if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
746 mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
747 if (mv->sp)
748 for (i = 0; i < mv->sp->rnum; i++)
749 {
750 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
751 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
752 {
753 const int pos = (int)(intptr_t)*tensor;
754 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
755 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 755, __extension__ __PRETTY_FUNCTION__
); }))
;
756 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
757 }
758 }
759 }
760 return tensor;
761}
762
763typedef struct {
764 const uint8_t* ptr;
765 int pos;
766} ccv_nnc_tensor_block_pos_t;
767
768static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
769{
770 int i;
771 int unref_block_ref = block_ref;
772 while (prep->tensor_blocks[unref_block_ref].ref)
773 unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
774 int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
775 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 775, __extension__ __PRETTY_FUNCTION__); }))
;
776 assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
== prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 776, __extension__ __PRETTY_FUNCTION__
); }))
;
777 const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
778 uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
779 int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
780 for (i = idx - 1; i >= 0; i--)
781 {
782 assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
(p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 782, __extension__ __PRETTY_FUNCTION__); }))
;
783 const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
784 const int unroll_count = graph_prep->unroll_count;
785 if (ch[i]) // Prefer the dup side of things.
786 p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
787 int unref_p_ref = p_ref;
788 while (graph_prep->tensor_blocks[unref_p_ref].ref)
789 unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
790 vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
791 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
792 offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
793 // If the buffer already exists, prefer that.
794 const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
795 if (ptr)
796 {
797 // If I have any remaining path that is not covered from 0, I cannot possibly
798 // have any pointer from buffer (that can only happen if it is not dup).
799 for (--i; i >= 0; i--)
800 if (ch[i] != 0)
801 return 0;
802 // Try to find the created tensor block pos in the array, just linear scan.
803 const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
804 ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
805 *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
806 ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
807 return tv_pos;
808 }
809 p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
810 }
811 return 0;
812}
813
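/* Illustrative sketch, not part of the original source: the loop in
 * _ccv_nnc_tensor_multiview_find_pos above climbs from the current prep to its ancestors
 * through p_refs, adding the block offset contributed at each level, until it reaches a
 * buffer whose base pointer is already allocated; the resulting data pointer is that base
 * plus the accumulated offset. In simplified form (hypothetical helper, ignoring the
 * dup/ch bookkeeping): */
static unsigned char* sketch_resolve_ptr(unsigned char* const base_ptr, const unsigned long long* const level_offsets, const int levels)
{
	unsigned long long offset = 0;
	int i;
	for (i = 0; i < levels; i++) /* one offset contribution per enclosing graph prep */
		offset += level_offsets[i];
	return base_ptr ? base_ptr + offset : 0; /* no base pointer yet => cannot compose here */
}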
814 // Descend from the root to the prep level, and compose the multiview from there.
815static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
816{
817 assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 817, __extension__ __PRETTY_FUNCTION__); }))
;
818 int i;
819 const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
820 const int unroll_count = prep->unroll_count;
821 if (prep == graph_prep)
822 {
823 const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
824 if (!data_pos)
825 return -1;
826 // Based on ch, go all the way back to find the exact pointer to compose.
827 if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
828 prep->dup_tensor_block_ref &&
829 prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
830 prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
831 {
832 int pos[unroll_count + 1];
833 pos[0] = data_pos;
834 for (i = 0; i < unroll_count; i++)
835 pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
836 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
837 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
838 ccv_nnc_tensor_t* data[unroll_count + 1];
839 for (i = 0; i < unroll_count + 1; i++)
840 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
841 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
842 for (i = 0; i < unroll_count + 1; i++)
843 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
844 *pos_ref = mv_pos;
845 } else {
846 *pos_ref = data_pos;
847 }
848 if (preserve)
849 {
850 // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv;
851 // at any other loop, it should be the same. Thus, for this case, I will create a mv tensor as follows:
852 // a mv of K11, so when the loop is 0 it unwraps to mv->data[0], otherwise it unwraps to mv->data[1].
853 // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until the parent
854 // arena is allocated).
855 // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
856 // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
857 // it in a K01 structure.
858 // Why didn't we wrap it directly as mv->data[0] pointing to the assigned tv pointer and mv->data[1] pointing
859 // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one
860 // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
861 int prev_mv_pos = *pos_ref;
862 if (prev_mv_pos == -1)
863 {
864 prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
866 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
867 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
868 tv,
869 }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
870 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
871 }
872 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
873 ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
874 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
875 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
876 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
877 (ccv_nnc_tensor_t*)prev_mv,
878 }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
879 prev_mv->p = (void*)(intptr_t)mv_pos;
880 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
881 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
882 *pos_ref = mv_pos;
883 }
884 return 0;
885 }
886 ch[idx] = 0;
887 int pos[unroll_count + 1];
888 pos[0] = 0;
889 const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
890 assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 890, __extension__ __PRETTY_FUNCTION__); }))
;
891 for (i = 0; i < unroll_count; i++)
892 {
893 ch[idx] = i + 1;
894 pos[i + 1] = 0;
895 const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
896 if (dup_retval < 0)
897 {
898 assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 898, __extension__ __PRETTY_FUNCTION__); }))
;
899 break;
900 }
901 }
902 // If current prep has no dup.
903 if (i == 0)
904 {
905 *pos_ref = pos[0];
906 return 0;
907 }
908 ccv_nnc_tensor_t* data[unroll_count + 1];
909 // Compose to a new multiview.
910 for (i = 0; i < unroll_count + 1; i++)
911 { assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
(pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 911, __extension__ __PRETTY_FUNCTION__); }))
; }
912 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
913 for (i = 0; i < unroll_count + 1; i++)
914 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
915 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
916 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
917 for (i = 0; i < unroll_count + 1; i++)
918 if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
919 ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
920 for (i = 0; i < unroll_count + 1; i++)
921 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
922 *pos_ref = mv_pos;
923 return 0;
924}
925
926static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
927{
928 int i;
929 int is_input = 0;
930 assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 930, __extension__ __PRETTY_FUNCTION__); }))
;
931 for (i = 0; i < node->input_size && !is_input; i++)
932 if (p_ref == node->inputs[i])
933 is_input = 1;
934 int is_output = 0;
935 for (i = 0; i < node->output_size && !is_output; i++)
936 if (p_ref == node->outputs[i])
937 is_output = 1;
938 // Prefer treating it as an output if it is both an input and an output.
939 if (is_output)
940 return 1;
941 if (is_input)
942 return -1;
943 return 0;
944}
945
946static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
947{
948 // No need to check whether to preserve if this is not a while loop.
949 if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
950 return 0;
951 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 951, __extension__ __PRETTY_FUNCTION__
); }))
;
952 // If it is unassigned, no need to preserve.
953 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
954 return 0;
955 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
956 // If p is not input, no need to preserve at all.
957 if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
958 return 0;
959 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
960 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 960, __extension__ __PRETTY_FUNCTION__); }))
;
961 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 961, __extension__ __PRETTY_FUNCTION__
); }))
;
962 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
963 // If the buffer is a truly read-only one, no need to preserve.
964 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
965 return 0;
966 /* This needs a detailed explanation: what does preserve mean?
967 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
968 * also used outside of the while loop, we cannot reuse the memory region of x inside
969 * the loop, otherwise we will destroy x when computing y = x + 1 (assuming
970 * y uses the same memory region as x). The way to work around this is to use a different
971 * memory region for y = x + 1, but for the first iteration, have x point to the
972 * original. During the allocation process, the way to identify whether x should preserve
973 * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
974 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
975 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
976 * it is the input tensor whenever possible. A tensor block can point to two parent
977 * tensors: one is the input tensor, one is the output tensor. p_refs[0] should be the input
978 * tensor whenever possible. (See the illustrative sketch after this function.) */
979 if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
980 return 0;
981 // Otherwise, return 1 because we now need to preserve.
982 return 1;
983}
984
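/* Illustrative sketch, not part of the original source: a concrete instance of the preserve
 * rule explained above. Take while { y = x + 1 } (y => x) where x is also read after the
 * loop; if the allocator let y share x's buffer, iteration 0 would clobber the external x.
 * _ccv_nnc_tensor_block_check_preserve returns 1 exactly when the block sits in a while
 * loop, is an input of the parent node, lands in a writable buffer, and that buffer's
 * recorded parent differs from the block's own input parent. Restated over hypothetical
 * pre-computed flags: */
static int sketch_needs_preserve(const int is_while_loop, const int block_is_input_of_parent, const int buffer_is_read_only, const int buffer_parent_ref, const int block_parent_ref)
{
	if (!is_while_loop || !block_is_input_of_parent || buffer_is_read_only)
		return 0;
	return buffer_parent_ref != block_parent_ref; /* mirrors the buffers[buffer_ref].p_refs[0] - 1 == p_ref check */
}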
985static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
986{
987 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 987, __extension__ __PRETTY_FUNCTION__
); }))
;
988 // If it is unassigned, no need to preserve.
989 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
990 return 0;
991 // Only tape vars need to force broadcast; otherwise we already share the same memory region.
992 if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
993 return 0;
994 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
995 // If p is not output, no need to broadcast at all.
996 if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
997 return 0;
998 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
999 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 999, __extension__ __PRETTY_FUNCTION__); }))
;
1000 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1000, __extension__ __PRETTY_FUNCTION__
); }))
;
1001 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
1002 // If the buffer is a truly read-only one, no need to broadcast.
1003 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
1004 return 0;
1005 // Otherwise, return 1 because we now need to force broadcast for this tape var.
1006 return 1;
1007}
1008
1009static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
1010{
1011 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1011, __extension__ __PRETTY_FUNCTION__); }))
;
1012 int i;
1013 for (i = 0; i < mv->kind + mv->repeat; i++)
1014 if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
1015 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = tensor;
1016 else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
1017 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i], tensor);
1018}
1019
1020static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1021{
1022 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1022, __extension__ __PRETTY_FUNCTION__); }))
;
1023 int i;
1024 if (mv->sp)
1025 for (i = 0; i < mv->sp->rnum; i++)
1026 {
1027 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
1028 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1029 {
1030 const int pos = (int)(intptr_t)*tensor;
1031 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1032 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 1032, __extension__ __PRETTY_FUNCTION__
); }))
;
1033 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1034 }
1035 }
1036 for (i = 0; i < mv->kind + mv->repeat; i++)
1037 {
1038 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]) & 1)
)
1039 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1040 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]->alias_ref) & 1)
)
1041 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref);
1042 if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
1043 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1044 }
1045}
1046
1047static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1048{
1049 // Go to the root of the graph.
1050 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1051 int i;
1052 for (i = 1; prep->p; i++)
1053 prep = prep->p;
1054 // Root graph should have no dup tensor blocks.
1055 assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1055, __extension__ __PRETTY_FUNCTION__); }))
;
1056 const int c = i;
1057 const ccv_nnc_symbolic_graph_prep_t* preps[c];
1058 prep = graph_prep;
1059 preps[c - 1] = prep;
1060 for (i = 0; prep->p; i++)
1061 preps[c - 2 - i] = prep = prep->p;
1062 int ch[c]; // Use a dynamically-sized array. This records our selections when recursing from top to bottom.
1063 memset(ch, 0, sizeof(int) * c);
1064 int pos = 0;
1065 _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1066 assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
(ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1066, __extension__ __PRETTY_FUNCTION__); }))
; // This should never be modified.
1067 assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
> 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1067, __extension__ __PRETTY_FUNCTION__); }))
;
1068 return pos;
1069}
1070
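/* Illustrative sketch, not part of the original source: _ccv_nnc_tensor_multiview_gen above
 * first measures the nesting depth c by walking prep->p up to the root, then fills preps[]
 * root-first so the recursion can descend from index 0 back down to the current prep. A
 * standalone model of that ordering, over a hypothetical parent-pointer node type: */
typedef struct sketch_prep_s { struct sketch_prep_s* p; } sketch_prep_t; /* parent pointer only */
static int sketch_collect_preps(const sketch_prep_t* const leaf, const sketch_prep_t** const out, const int capacity)
{
	int depth = 1;
	const sketch_prep_t* node = leaf;
	for (; node->p; node = node->p) /* count levels up to (and including) the root */
		++depth;
	if (depth > capacity)
		return -1;
	int i;
	node = leaf;
	for (i = depth - 1; i >= 0; i--, node = node->p) /* root lands at out[0], leaf at out[depth - 1] */
		out[i] = node;
	return depth;
}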
1071static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1072{
1073 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1074 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1075 ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1076 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1077 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1078 tv,
1079 }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1080 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1081 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = tensor;
1082 return mv_pos;
1083}
1084
1085static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1086{
1087 ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1088 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1089 if (!is_multiview)
1090 return pos;
1091 while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1092 {
1093 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1094 tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1095 }
1096 const ccv_nnc_tensor_t tensor = *tensor_ptr;
1097 const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1098 ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1099 *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1100 new_tensor->dataof = tensor.dataof;
1101 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1102 new_tensor->alias_ref = (uintptr_t)pos;
1103 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1104 return new_pos;
1105}
1106
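/* Illustrative sketch, not part of the original source: _ccv_nnc_tensor_flat_if_multiview
 * above follows a multiview's data[0] chain until it reaches a concrete tensor, then
 * publishes a flat tensor over that data and registers it against the multiview so it can
 * be kept in sync when the multiview is rebound. The chain walk, in simplified form over a
 * hypothetical node type: */
typedef struct sketch_mv_s { struct sketch_mv_s* first; void* payload; } sketch_mv_t; /* first == 0 means "concrete tensor" */
static void* sketch_flatten(const sketch_mv_t* node)
{
	while (node->first) /* mirrors the CCV_IS_TENSOR_MULTIVIEW(...) loop above */
		node = node->first;
	return node->payload; /* the concrete data the flat tensor borrows its pointer from */
}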
1107static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1108{
1109 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1110 // The tensor it references is not an alias.
1111 assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__
({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1111, __extension__ __PRETTY_FUNCTION__
); }))
;
1112 const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1113 const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1114 assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1114, __extension__ __PRETTY_FUNCTION__
); }))
;
1115 // Will use that to determine whether to insert a reference or not.
1116 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1117 while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1118 {
1119 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1120 alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1121 }
1122 const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1123 // If there is no ofs, and the stride is packed (matching dim), we take a shortcut and just init it as a normal tensor.
1124 int pos;
1125 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1126 ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1127 {
1128 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1129 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1130 *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1131 tensor->dataof = alias_tensor.dataof;
1132 } else {
1133 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1134 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1135 // Otherwise initialize a tensor view
1136 *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1137 tensor_view->alias_ref = (uintptr_t)alias_pos;
1138 }
1139 vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1140 if (is_multiview)
1141 {
1142 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1143 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1144 }
1145}
1146
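/* Illustrative sketch, not part of the original source: the alias assignment above takes a
 * shortcut when the alias has no offset and a packed stride (it then behaves exactly like a
 * plain tensor over the same memory); only otherwise does it materialize a tensor view with
 * ofs/stride. The decision, restated over hypothetical flags: */
static int sketch_alias_needs_view(const int has_ofs, const int stride_is_packed)
{
	return has_ofs || !stride_is_packed; /* a plain ccv_nnc_tensor suffices only when both checks pass */
}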
1147static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1148{
1149 // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1150 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1151 {
1152 const int ref = tensor_blocks[block_ref].alias_ref - 1;
1153 if (!vt_tensors[ref])
1154 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1155 vt_tensors[block_ref] = vt_tensors[ref];
1156 return;
1157 }
1158 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1158, __extension__ __PRETTY_FUNCTION__
); }))
;
1159 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1160 // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases are assigned.
1161 if (!vt_tensors[alias_ref])
1162 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1163 _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1164}
1165
1166 // Turn a linear pointer into an object storage (such as an MTLBuffer).
1167#ifdef HAVE_MPS
1168static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
1169{
1170 mpobjfree(0, ptr);
1171}
1172#endif
1173
1174typedef struct {
1175 size_t size;
1176 void* obj;
1177} tensor_arena_obj_track_t;
1178
1179typedef struct {
1180 void* ptr;
1181 off_t offset;
1182 size_t size;
1183} obj_ptr_key_t;
1184
1185static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1186{
1187 return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1188}
1189
1190static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1191{
1192 return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1193}
1194
1195 KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal) /* macro expansion omitted */
1196
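// Resolve the object to store as a tensor's data pointer for a region of an arena buffer.
// Tensors with an empty first dimension return 0. On MPS, GPU-memory regions get a backing
// object from mpobjcreate; the obj_ptr map is consulted first so identical (ptr, offset, size)
// regions reuse one object, and each newly created object is registered with a disposer on the
// arena. Everything else (and every non-MPS build) simply returns ptr + offset.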
1197static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1198{
1199 if (params.dim[0] == 0)
1200 return 0;
1201#ifdef HAVE_MPS
1202 if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1203 {
1204 int ret;
1205 const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
12]
* ccv_nnc_tensor_count(params);
1206 const obj_ptr_key_t key = {
1207 .ptr = ptr,
1208 .offset = offset,
1209 .size = size,
1210 };
1211 khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
1212 if (ret != 0)
1213 {
1214 void* obj = mpobjcreate(ptr, offset, size);
1215 if (!tensor_arena->disposers)
1216 tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1217 ccv_nnc_arena_disposer_t disposer = {
1218 .ptr = obj,
1219 .userdata = 0,
1220 .dispose = _ccv_nnc_tensor_arena_obj_dispose
1221 };
1222 ccv_array_push(tensor_arena->disposers, &disposer);
1223 kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1224 return obj;
1225 } else
1226 return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1227 }
1228#endif
1229 return ptr + offset;
1230}
1231
1232static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1233{
1234 // All tensors are assigned out now; num_assigned is the number of discontinuous buffers.
1235 // Each tensor has its designation in the assigned array, and its offset in allocated_offset.
1236 const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1237 ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1238 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1239 const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1240 const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1241 const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1242 const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1243 const int unroll_count = graph_prep->unroll_count;
1244 int i, j;
1245 for (i = 0; i < tensor_symbol_info_size; i++)
1246 for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1247 {
1248 const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1249 if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1250 TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
)
;
1251 }
1252 ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
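 // The arena is one contiguous allocation: the struct itself is followed by the buffers array,
 // the sub_arenas pointer array, the vt_tensors pointer array and the vt_alias_refs array,
 // which the assignments below carve out of the same block.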
1253 graph_prep->tensor_arena = tensor_arena;
1254 tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1255 tensor_arena->buffers = (void*)(tensor_arena + 1);
1256 tensor_arena->buffer_size = alloc_prep->buffer_size;
1257 tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1258 tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1259 tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1260 tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1261 tensor_arena->pb_vt_tensors = 0;
1262 tensor_arena->vt_alias_r_refs_p = 0;
1263 tensor_arena->vt_alias_r_refs = 0;
1264 tensor_arena->vt_sizes = 0;
1265 tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1266 tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, (sizeof(ccv_nnc_tensor_t) * tensor_symbol_info_size + 15) / 16, 0);
1267 tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1268 tensor_arena->allocator.context.free = allocator.context.free;
1269 tensor_arena->allocator.isa = allocator.isa;
1270 tensor_arena->disposers = 0;
1271 // Copy alias_ref info back to the tensor arena.
1272 for (i = 0; i < tensor_symbol_info_size; i++)
1273 tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1274 // Do the buffer copies.
1275 for (i = 0; i < alloc_prep->buffer_size; i++)
1276 tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1277 tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1278 tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1279 if (graph_prep->while_count_tensor)
1280 {
1281  // If we need to have a while count tensor, allocate that first and set its pointer to point to the while_count variable.
1282 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1283 assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1283, __extension__ __PRETTY_FUNCTION__
); }))
; // pos must be 0 position.
1284 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1285 *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1286 }
1287 assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
&& !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
&& p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1287, __extension__ __PRETTY_FUNCTION__
); }))
;
1288 if (p_arena && p_graph_prep)
1289 {
1290 // Don't need to allocate the actual buffer, just use the pointer from the above.
1291 PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer assignment for sub arena %p (parent %p)\n",
tensor_arena, p_arena); fflush(stdout); } } while (0)
;
1292 for (i = 0; i < tensor_arena->buffer_size; i++)
1293 {
1294 const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1295 int unref_p_ref = p_ref;
1296 while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1297 unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1298 assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1298, __extension__ __PRETTY_FUNCTION__
); }))
;
1299 const int p_unroll_count = p_graph_prep->unroll_count;
1300 if (p_graph_prep->dup_tensor_block_ref &&
1301 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1302 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1303 {
1304    // This condition means that in the parent graph, we point to multiple tensor blocks for the same
1305    // buffer; therefore, we cannot have one single pointer assigned in this case.
1306    // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
1307 tensor_arena->buffers[i].ptr = 0;
1308 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1309 continue;
1310 }
1311 // Otherwise, find the actual buffer pointer.
1312 const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1313 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1313, __extension__ __PRETTY_FUNCTION__); }))
;
1314 const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1315 if (!p_arena->buffers[buffer_ref].ptr)
1316 {
1317 // Pass it down as 0 ptr.
1318 tensor_arena->buffers[i].ptr = 0;
1319 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1320 continue;
1321 }
1322 const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1323 tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1324 PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
(0)
;
1325 }
1326 } else {
1327 // Now, allocate actual buffers.
1328 PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0)
;
1329 for (i = 0; i < tensor_arena->buffer_size; i++)
1330 {
1331 const int buffer_type = tensor_arena->buffers[i].type;
1332 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1333#ifdef HAVE_CUDA1
1334 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1335 {
1336 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1337 if (allocator.isa && allocator.isa->alloc)
1338 tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1339 else
1340 tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1341 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1342 } else {
1343 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1343, __extension__ __PRETTY_FUNCTION__
); }))
;
1344 if (tensor_arena->buffers[i].pin_mem)
1345 tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1346 else
1347 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1348 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1349 }
1350#elif defined(HAVE_MPS)
1351 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1352 {
1353 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1354 // if (allocator.isa && allocator.isa->alloc)
1355 // tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1356 // else
1357 tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1358 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1359 } else {
1360 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1360, __extension__ __PRETTY_FUNCTION__
); }))
;
1361 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1362 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1363 }
1364#else
1365 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1365, __extension__ __PRETTY_FUNCTION__
); }))
;
1366 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1367 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1368#endif
1369 assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1369, __extension__ __PRETTY_FUNCTION__); }))
;
1370 }
1371 }
1372 // Go over sub_preps and allocate arenas for them. Do this early because
1373 // we may reference tensors from sub arenas; the reason we need to reference
1374 // tensors from sub arenas is that, for output tensors, the sub arena's tensors
1375 // will have automatic reference updates.
1376 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1377 if (graph_prep->sub_preps[i])
1378 tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1379 else
1380 tensor_arena->sub_arenas[i] = 0;
1381 memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1382 // Now that sub-arenas are all assigned, go over their outputs to assign out tensors from them directly.
1383 ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
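 // sub_arena_out_tensors maps a tensor symbol index to the multi-view tensor a sub arena
 // produces for it (filled in by the loop below for while-loop outputs); it is consulted later
 // so such tensors are reused here rather than allocated again.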
1384#ifdef HAVE_MPS
1385 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1386#else
1387 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1388#endif
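 // obj_ptr_map is only needed for the MPS path of _ccv_nnc_tensor_arena_obj_create, where it
 // deduplicates backing objects across tensors; elsewhere it stays 0 and is never touched.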
1389 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1390 if (tensor_arena->sub_arenas[i])
1391 {
1392 assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1392, __extension__ __PRETTY_FUNCTION__
); }))
;
1393 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1394 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1395 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1396 for (j = 0; j < node->output_size; j++)
1397 {
1398 const int idx = node->outputs[j];
1399 const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1;
1400 assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
(s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1400, __extension__ __PRETTY_FUNCTION__); }))
;
1401 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1402 assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1402, __extension__ __PRETTY_FUNCTION__); }))
;
1403 ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1404 // Only assign if it is a multiview tensor.
1405 if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1406 (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1407 sub_arena_out_tensors[idx] = sub_tensor;
1408 }
1409 }
1410 // Assigning out the tensors (in case of sharing tensors / in-place ops).
1411 for (i = 0; i < tensor_symbol_info_size; i++)
1412 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
1413 {
1414 const int vt_ref = alloc_prep->vt_blocks[i];
1415 const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1416   // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1417   // previous layer; therefore, we cannot really find the buffer ptr.
1418   if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1419 ((graph_prep->dup_tensor_block_ref &&
1420 graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1421 graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1422 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1423 {
1424 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1424, __extension__ __PRETTY_FUNCTION__
); }))
; // This must be in a sub-graph.
1425    // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1426 if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1427 continue;
1428 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1429 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1430 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1431 } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1432    // When we want to allocate, we don't really need to if it needs a force broadcast, because we will handle that later.
1433 const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1434 // If already created, use the same tensor, and continue.
1435 // Having ptr.
1436 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1437 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1438 // Also, set its allocations.
1439    // Since tensor view is bit-compatible with tensor, we can just cast.
1440 void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1441 *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1442 assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1442, __extension__ __PRETTY_FUNCTION__
); }))
;
1443 // If we need to force broadcast, we need to wrap it in a multiview.
1444 if (graph_prep->tensor_blocks[i].p_refs[0] &&
1445 _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1446 {
1447 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1448 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1449 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1450 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1451 tv,
1452 }, 0, 1, graph_prep->graph, mv);
1453 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1454 pos = mv_pos;
1455 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1456 }
1457 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1458 }
1459 }
1460#ifdef HAVE_MPS
1461 kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1462#endif
1463 // Handle bound tensors. First handle cases without aliases.
1464 for (i = 0; i < tensor_bind_size; i++)
1465 {
1466 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1466, __extension__ __PRETTY_FUNCTION__
); }))
;
1467 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1468 if (resolved_symbol.d >= 0)
1469 {
1470 int d = resolved_symbol.d;
1471 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1472 continue;
1473   // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1474   // It has nothing to do with aliases.
1475 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1476 d = tensor_blocks[d].ref - 1;
1477   // For bound tensors, it shouldn't be assigned yet.
1478   // If it is assigned, the pointer should match the one from the bound tensor.
1479   // This can only happen if an enforced in-place tensor is bound twice. If that
1480   // happens, we need to make sure it is bound to the same location.
1481 assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1481, __extension__ __PRETTY_FUNCTION__
); }))
;
1482 // See above assertion.
1483 if (tensor_arena->vt_tensors[d])
1484 continue;
1485 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1486 {
1487 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1488 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1489 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1490    if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1491 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1492 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1492, __extension__ __PRETTY_FUNCTION__
); }))
; }
1493    // It is OK to be, as a whole, smaller than or equal to the bound one.
1494 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1494, __extension__ __PRETTY_FUNCTION__
); }))
;
1495 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1496 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1497 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1498 } else {
1499 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1500 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1501 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1502 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1503 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1504    tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy them over.
1505 tv->dataof = tensor_binds[i].tensor->dataof;
1506 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1507 }
1508 }
1509 }
1510 // Handle bound tensors. We handle aliases here so they can reference bound tensors.
1511 for (i = 0; i < tensor_bind_size; i++)
1512 {
1513 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1513, __extension__ __PRETTY_FUNCTION__
); }))
;
1514 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1515 if (resolved_symbol.d >= 0)
1516 {
1517 int d = resolved_symbol.d;
1518 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1519 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1520   // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1521   // It has nothing to do with aliases.
1522 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1523 d = tensor_blocks[d].ref - 1;
1524 if (tensor_arena->vt_tensors[d])
1525 continue;
1526   // Assert that the original alias has no ofs; otherwise our binding will be problematic.
1527 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1528 { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
== 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1528, __extension__ __PRETTY_FUNCTION__
); }))
; }
1529 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1530 {
1531 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1532 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1533 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1534    if (otv->off > 0) // If there is an off, this has to be the same dimensionality, or smaller at each dimension.
1535 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1536 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1536, __extension__ __PRETTY_FUNCTION__
); }))
; }
1537    // It is OK to be, as a whole, smaller than or equal to the bound one.
1538 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1538, __extension__ __PRETTY_FUNCTION__
); }))
;
1539 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1540 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1541 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1542 } else {
1543 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1544 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1545 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1546 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1547 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1548 tv->data = tensor_binds[i].tensor->data;
1549 tv->dataof = tensor_binds[i].tensor->dataof;
1550 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1551 }
1552 }
1553 }
1554 // Assign out refs; refs are the simple ones, so we should handle them first (because they point to exactly the same metadata and the same region).
1555 // Avoid refs that are actually aliases.
1556 for (i = 0; i < tensor_symbol_info_size; i++)
1557 // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1558 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1559 {
1560 int ref = tensor_blocks[i].ref - 1;
1561 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1562 ref = tensor_blocks[ref].ref - 1;
1563 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1563, __extension__ __PRETTY_FUNCTION__); }))
;
1564 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1565 }
1566 // Now that refs are assigned out, handle the case where a tensor needs to be preserved because this is a sub-graph of a while loop.
1567 if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1568 {
1569 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1569, __extension__ __PRETTY_FUNCTION__
); }))
;
1570 const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1571 const int p_idx = graph_prep->p_idx - 1;
1572 for (i = 0; i < node->input_size; i++)
1573 {
1574 const int idx = node->inputs[i];
1575 int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx)))
- 1;
1576 assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1576, __extension__ __PRETTY_FUNCTION__); }))
;
1577 const int vt_ref = alloc_prep->vt_blocks[block_ref];
1578 if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1579 continue;
1580 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1580, __extension__ __PRETTY_FUNCTION__); }))
;
1581 const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1582 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1582, __extension__ __PRETTY_FUNCTION__); }))
;
1583 assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1583, __extension__ __PRETTY_FUNCTION__
); }))
;
1584   // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1585   // previous layer; therefore, we cannot really find the buffer ptr.
1586   if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1587 ((graph_prep->dup_tensor_block_ref &&
1588 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1589 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1590 !tensor_arena->buffers[buffer_ref].ptr))
1591 {
1592 // We haven't allocated anything for this yet.
1593 assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1593, __extension__ __PRETTY_FUNCTION__
); }))
;
1594 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1595 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1596 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1597 } else {
1598 const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1599 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1600 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1601 }
1602 }
1603 }
1604 // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1605 // This creates the multi-view tensor to achieve that.
1606 for (i = 0; i < tensor_symbol_info_size; i++)
1607 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1608 {
1609 const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1610 // Create phi multi-view.
1611 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1612 const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1613 const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1614 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1615 ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1616 ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1617 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1618 intv,
1619 outv,
1620 }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1621 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1622 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1623 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1624 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1625 }
1626 // Now it is time to handle aliases.
1627 for (i = 0; i < alloc_prep->block_size; i++)
1628 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1629 {
1630 const int block_ref = alloc_prep->blocks[i].block_ref;
1631 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1632 {
1633 // Assigning out the tensor aliases.
1634 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1634, __extension__ __PRETTY_FUNCTION__
); }))
;
1635 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1636 }
1637 }
1638 // Now assigning out the rest of alias refs.
1639 for (i = 0; i < tensor_symbol_info_size; i++)
1640 // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1641 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1642 {
1643 int ref = tensor_blocks[i].alias_ref - 1;
1644 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1644, __extension__ __PRETTY_FUNCTION__); }))
;
1645 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1646 }
1647 // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1648 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1649 if (tensor_arena->sub_arenas[i])
1650 {
1651 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1652 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1653 for (j = 0; j < node->input_size; j++)
1654 {
1655 const int idx = node->inputs[j];
1656 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1657 if (s_idx < 0)
1658 continue;
1659 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1660 // Only do the replacement if it is a multi-view tensor.
1661 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1662 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1663 {
1664    // It cannot be a bound tensor.
1665 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1665, __extension__ __PRETTY_FUNCTION__
); }))
;
1666 const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1667 const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1668 ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1669    // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1670    // to this tensor.
1671 if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1672 {
1673 const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1674 ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1675 ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1676 ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1677 ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1678 ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
: (multiview)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]);
1679 while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1680 tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]);
1681 *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1682 ref_tensor->data = tv->data;
1683 ref_tensor->dataof = tv->dataof;
1684 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1685 } else
1686 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1687 }
1688 }
1689 }
1690 // After aliases are created, for the case..of statement, we now revert back to a flat tensor rather than a multi-view.
1691 // No worries though: this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1692 // when initializing the case..of node that will take the phi multi-view again.
1693 for (i = 0; i < tensor_symbol_info_size; i++)
1694 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1695 {
1696 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
& 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1696, __extension__ __PRETTY_FUNCTION__
); }))
;
1697 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1698 assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1698, __extension__ __PRETTY_FUNCTION__); }))
;
1699 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1700 }
1701 // Rewire the rest. We can rewire multiple times because we can identify whether it is wired or not.
1702 for (i = 0; i < tensor_symbol_info_size; i++)
1703 if (tensor_arena->vt_tensors[i])
1704 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1705 // Associate multiview tensors from the sub arenas with the parent.
1706 if (sub_arena_out_tensors)
1707 {
1708 for (i = 0; i < alloc_prep->block_size; i++)
1709 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1710 {
1711 const int block_ref = alloc_prep->blocks[i].block_ref;
1712 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1713 continue;
1714 int sub_arena_ref = block_ref;
1715 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1716 {
1717 // Assigning out the tensor aliases.
1718 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1718, __extension__ __PRETTY_FUNCTION__
); }))
;
1719 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1720    // What it references is not an alias.
1721 assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1721, __extension__ __PRETTY_FUNCTION__
); }))
;
1722 sub_arena_ref = alias_ref;
1723 if (!sub_arena_out_tensors[sub_arena_ref])
1724 continue;
1725 }
1726 if (!sub_arena_out_tensors[sub_arena_ref])
1727 continue;
1728 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1729 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1729, __extension__ __PRETTY_FUNCTION__); }))
;
1730   // This is only possible if vt_tensors[block_ref] is a phi node.
1731 if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1732 {
1733    // For a phi node, sub_arena_out_tensors is only relevant to its selected output; therefore, set that to be the receiver of the broadcast.
1734 ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1735 assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1735, __extension__ __PRETTY_FUNCTION__); }))
;
1736 assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1736, __extension__ __PRETTY_FUNCTION__
); }))
;
1737 CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]->alias_ref = (uintptr_t)mv;
1738 ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]);
1739 } else {
1740 tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1741 ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1742 }
1743 }
1744 }
1745 // Go over all the tensors that have assign_ref. If the tensor it is assigned from is:
1746 // 1). from sub_arena_out_tensors, it could now be pointing to an area this arena doesn't know about;
1747 // 2). from a phi multi-view, in which case this arena won't know in advance which memory is going to be used.
1748 // Therefore, for the above two scenarios, the tensor with assign_ref, even if it is a multiview tensor, needs to subscribe
1749 // to the output of the assign_ref tensor.
1750 for (i = 0; i < tensor_symbol_info_size; i++)
1751 if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1752 {
1753 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1754 ccv_nnc_tensor_t* assign_tensor;
1755 if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1756 assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1757 else
1758 assign_tensor = tensor_arena->vt_tensors[assign_ref];
1759 ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1760 }
1761 // After everything is handled, assert again to make sure the tensors and tensor binds point to the right location. This is really just for assertion.
1762 for (i = 0; i < tensor_bind_size; i++)
1763 {
1764 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1764, __extension__ __PRETTY_FUNCTION__
); }))
;
1765 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1766 if (resolved_symbol.d >= 0)
1767 {
1768 int d = resolved_symbol.d;
1769   // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
1770   // It has nothing to do with aliases.
1771 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1772 d = tensor_blocks[d].ref - 1;
1773 // Note we don't trace back on alias. This is intentional.
1774 assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1774, __extension__ __PRETTY_FUNCTION__
); }))
;
1775 }
1776 }
1777 if (sub_arena_out_tensors)
1778 ccfreefree(sub_arena_out_tensors);
1779 // Rewire sub arena's tensor references.
1780 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1781 if (tensor_arena->sub_arenas[i])
1782 {
1783 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1784 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1785 for (j = 0; j < node->input_size; j++)
1786 {
1787 const int idx = node->inputs[j];
1788 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1789 if (s_idx < 0)
1790 continue;
1791 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1792 // Only do the replacement if it is a multi-view tensor.
1793 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1794 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1795 {
1796     // This is a bound tensor; bind it now.
1797 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1798 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1799 else
1800 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1801 }
1802 }
1803 }
1804 return tensor_arena;
1805}
1806
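// Recursively search this arena and its sub arenas for the one whose graph_ref matches the
// given symbolic graph, and return its tensor at index pair_ref (0 if none is found).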
1807static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1808{
1809 assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1809, __extension__ __PRETTY_FUNCTION__); }))
;
1810 if ((intptr_t)graph == tensor_arena->graph_ref)
1811 {
1812 assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
0 && pair_ref < tensor_arena->vt_tensor_size) ;
else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1812, __extension__ __PRETTY_FUNCTION__
); }))
;
1813 return tensor_arena->vt_tensors[pair_ref];
1814 }
1815 int i;
1816 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1817 if (tensor_arena->sub_arenas[i])
1818 {
1819 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1820 if (tensor)
1821 return tensor;
1822 }
1823 return 0;
1824}
1825
1826static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1827{
1828 if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1829 tensor->type |= CCV_TAPE_ALLOC;
1830 else {
1831 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1832 mv->type |= CCV_TAPE_ALLOC;
1833 int i;
1834 for (i = 0; i < mv->repeat + mv->kind; i++)
1835 _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1836 }
1837}
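The tape-var marking above recurses through multi-view tensors: a multi-view carries repeat + kind sub-tensors (inline or heap allocated), and each sub-tensor may itself be a multi-view, so the flag must be pushed down to every leaf. A hedged, self-contained sketch of the same traversal over a hypothetical simplified node type (not the real ccv_nnc_tensor_multiview_t):

#include <stdio.h>

#define FLAG_MULTIVIEW 0x1
#define FLAG_TAPE      0x2

/* Hypothetical stand-in: either a leaf tensor or a multi-view with children. */
typedef struct view_s {
	int flags;
	int child_count;          /* repeat + kind in the real structure */
	struct view_s** children; /* inline or heap data in the real structure */
} view_t;

/* Mark a whole multi-view hierarchy, leaves included. */
static void mark_tape(view_t* const v)
{
	v->flags |= FLAG_TAPE;
	if (v->flags & FLAG_MULTIVIEW)
	{
		int i;
		for (i = 0; i < v->child_count; i++)
			mark_tape(v->children[i]);
	}
}

int main(void)
{
	view_t leaf0 = { 0, 0, 0 }, leaf1 = { 0, 0, 0 };
	view_t* children[2] = { &leaf0, &leaf1 };
	view_t root = { FLAG_MULTIVIEW, 2, children };
	mark_tape(&root);
	printf("%d %d %d\n", !!(root.flags & FLAG_TAPE), !!(leaf0.flags & FLAG_TAPE), !!(leaf1.flags & FLAG_TAPE)); /* 1 1 1 */
	return 0;
}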
1838
1839static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1840{
1841 assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
__assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1841, __extension__ __PRETTY_FUNCTION__
); }))
;
1842 int i;
1843 for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1844 {
1845 if (graph_prep->tensor_symbol_info[i].pair_ref)
1846 {
1847 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1848 // No need to continue checking this if it comes from its pair.
1849 continue;
1850 }
1851 if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1852 {
1853 // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1854 if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
)
)
1855 {
1856 const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1857 if (vt_ref >= 0 &&
1858 TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc)
== READ_ONLY)
1859 continue;
1860 }
1861 _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1862 }
1863 }
1864 for (i = 0; i < graph_prep->sub_prep_size; i++)
1865 if (graph_prep->sub_preps[i])
1866 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1867}
1868
1869static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1870{
1871 int i, found = 0;
1872 // Try to insert head.
1873 ccv_array_t* head = tensor_blocks.head;
1874 assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1874, __extension__ __PRETTY_FUNCTION__); }))
;
1875 for (i = 0; i < head->rnum;)
1876 {
1877 const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
;
1878 if (head_idx == idx)
1879 {
1880 found = 1;
1881 break;
1882 }
1883 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1884 if (cell.i32 && cell.i32[0] > 0)
1885 {
1886 /* If the current node is the parent of the head node, check if we found it or not. */
1887 /* If not found, replace the current one. */
1888 if (!found)
1889 {
1890 found = 1;
1891 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= idx;
1892 } else {
1893 /* Remove the current one, change the rnum. */
1894 if (i < head->rnum - 1)
1895 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(head->rnum - 1)))
;
1896 --head->rnum;
1897 continue;
1898 }
1899 } else {
1900 // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1901 cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1902 if (cell.i32 && cell.i32[0] > 0)
1903 {
1904 found = 1;
1905 break;
1906 }
1907 }
1908 /* Advancing i. */
1909 ++i;
1910 }
1911 /* If not found, push this idx to the end of the array. */
1912 if (!found)
1913 ccv_array_push(head, &idx);
1914 // Try to insert tail.
1915 found = 0;
1916 ccv_array_t* tail = tensor_blocks.tail;
1917 assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1917, __extension__ __PRETTY_FUNCTION__); }))
;
1918 for (i = 0; i < tail->rnum;)
1919 {
1920 const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
;
1921 if (tail_idx == idx)
1922 {
1923 found = 1;
1924 break;
1925 }
1926 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1927 if (cell.i32 && cell.i32[0] > 0)
1928 {
1929 /* If the current node is the child of the tail node, check if we found it or not. */
1930 /* If not found, replace the current one. */
1931 if (!found)
1932 {
1933 found = 1;
1934 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= idx;
1935 } else {
1936 /* Remove the current one, change the rnum. */
1937 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(tail->rnum - 1)))
;
1938 --tail->rnum;
1939 continue;
1940 }
1941 } else {
1942 // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1943 cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1944 if (cell.i32 && cell.i32[0] > 0)
1945 {
1946 found = 1;
1947 break;
1948 }
1949 }
1950 /* Advancing i. */
1951 ++i;
1952 }
1953 /* If not found, push this idx to the end of the array. */
1954 if (!found)
1955 ccv_array_push(tail, &idx);
1956}
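_ccv_nnc_tensor_block_add_exec keeps head and tail as minimal frontiers of the tensor block's life-time: a new exec replaces any entry it precedes (for head) or follows (for tail), is skipped if an existing entry already covers it, and is appended only when it is unordered with respect to everything already there. The ordering is read from the exec_dep sparse matrix, where a positive cell at (a, b) means exec b runs before exec a. A toy sketch of the head-side update, assuming a dense reachability matrix in place of the sparse one (the names below are illustrative, not the library's API):

#include <stdio.h>

#define N 4

/* dep[a][b] > 0 means exec b comes before exec a (b is an ancestor of a). */
static int dep[N][N];

/* Insert idx into a "head" frontier: keep only the earliest, mutually
 * unordered execs. Mirrors the replace/remove/skip cases in the real code. */
static void head_insert(int* const head, int* const rnum, const int idx)
{
	int i = 0, found = 0;
	while (i < *rnum)
	{
		const int h = head[i];
		if (h == idx) { found = 1; break; }
		if (dep[h][idx] > 0) /* idx precedes the current head entry */
		{
			if (!found) { head[i] = idx; found = 1; }
			else { head[i] = head[--*rnum]; continue; } /* drop redundant entry */
		} else if (dep[idx][h] > 0) { found = 1; break; } /* already covered */
		++i;
	}
	if (!found)
		head[(*rnum)++] = idx;
}

int main(void)
{
	/* A simple chain 0 -> 1 -> 2, plus an unrelated exec 3. */
	dep[1][0] = dep[2][0] = dep[2][1] = 1;
	int head[N], rnum = 0;
	head_insert(head, &rnum, 2);
	head_insert(head, &rnum, 0); /* replaces 2, since 0 precedes 2 */
	head_insert(head, &rnum, 3); /* unordered w.r.t. 0, so it is appended */
	printf("head frontier:");
	for (int i = 0; i < rnum; i++)
		printf(" %d", head[i]);
	printf("\n"); /* expected: 0 3 */
	return 0;
}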
1957
1958ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1959{
1960 if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1961 {
1962 assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
0 && symbol.d < tensor_arena->vt_tensor_size) ;
else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1962, __extension__ __PRETTY_FUNCTION__
); }))
;
1963 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1964 if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1965 {
1966 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1967 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1968 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1969 return (ccv_nnc_tensor_t*)mv;
1970 }
1971 return tensor;
1972 }
1973 int i;
1974 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1975 if (tensor_arena->sub_arenas[i])
1976 {
1977 ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1978 if (tensor)
1979 return tensor;
1980 }
1981 return 0;
1982}
1983
1984ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1985{
1986 if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1987 {
1988 assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
>= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1988, __extension__ __PRETTY_FUNCTION__
); }))
;
1989 return graph_exec_arena->graph_execs[symbol.d];
1990 }
1991 int i;
1992 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1993 if (graph_exec_arena->sub_arenas[i])
1994 {
1995 ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1996 if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
1997 return exec;
1998 }
1999 return (ccv_nnc_graph_exec_t){}; // 0.
2000}
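Both lookups above resolve a symbol by first matching the arena's graph_ref against the symbol's graph pointer and then searching sub-arenas, so callers only ever need the top-level arenas produced by compilation. A hedged usage sketch; tensor_arena, graph_exec_arena, x_symbol and exec_symbol are assumed to come from an earlier compile of the same symbolic graph and are not defined here:

#include "ccv_nnc.h"

/* Map a tensor symbol and an exec symbol back to their concrete objects.
 * The arenas are assumed to have been produced by compiling the symbolic
 * graph (or a graph containing it as a sub-graph) that the symbols belong to. */
static void lookup_concrete(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_tensor_symbol_t x_symbol, const ccv_nnc_graph_exec_symbol_t exec_symbol)
{
	/* Resolves through sub-arenas and unwraps a multi-view down to the
	 * currently active view. */
	ccv_nnc_tensor_t* const x = ccv_nnc_tensor_from_symbol(tensor_arena, x_symbol);
	/* Returns a zeroed exec (graph == 0) if the symbol is not materialized. */
	const ccv_nnc_graph_exec_t x_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
	if (x && x_exec.graph)
	{
		/* Both resolved; x->data.u8 points at the memory the arena assigned. */
	}
}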
2001
2002ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2003{
2004 return graph_exec_arena->source;
2005}
2006
2007ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2008{
2009 return graph_exec_arena->destination;
2010}
2011
2012// Check whether the head is the beginning of this block.
2013static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
2014{
2015 assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 2015, __extension__ __PRETTY_FUNCTION__
); }))
;
2016 return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0)))
== head_node);
2017}
2018
2019// Check whether the tail is the end of this block.
2020static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2021{
2022 assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2022, __extension__ __PRETTY_FUNCTION__
); }))
;
2023 return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0)))
== tail_node);
2024}
2025
2026// Make two tensor blocks one. Return 1 if that happened.
2027static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2028{
2029 // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2030 if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2031 (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2032 tensor_blocks[p_ref_0].tail->rnum == 1 &&
2033 tensor_blocks[p_ref_1].head->rnum == 1 &&
2034 tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2035 *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
)
== *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
)
)
2036 {
2037 // If the two parent refs match (thus, they meet at the same node), we can concatenate them and mark one as a ref. This is very similar to in-place operation combining.
2038 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2038, __extension__ __PRETTY_FUNCTION__); }))
;
2039 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2039, __extension__ __PRETTY_FUNCTION__); }))
;
2040 ccv_array_free(tensor_blocks[p_ref_0].tail);
2041 tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2042 if (tensor_blocks[p_ref_1].p_refs[0])
2043 {
2044 assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2044, __extension__ __PRETTY_FUNCTION__
); }))
; // It simply cannot have more than one p_ref, otherwise we cannot merge.
2045 if (!tensor_blocks[p_ref_0].p_refs[0])
2046 tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2047 else
2048 tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2049 }
2050 tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2051 TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
& ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)))
;
2052 ccv_array_free(tensor_blocks[p_ref_1].head);
2053 if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2054 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
| UNFOLDABLE_AS_INPUT))
;
2055 // No need to check UNFOLDABLE_AS_OUTPUT for p_ref_1, because if it were set, we could not fold right now.
2056 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
& ~0x3) | UNASSIGNED))
;
2057 tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2058 if (!tensor_blocks[p_ref_0].r_refs)
2059 tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2060 ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2061 tensor_blocks[p_ref_1].size = 0;
2062 tensor_blocks[p_ref_1].head = 0;
2063 tensor_blocks[p_ref_1].tail = 0;
2064 return 1;
2065 }
2066 return 0;
2067}
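The fold above merges two blocks that meet at exactly one exec: the surviving block keeps its head and adopts the other block's tail, while the folded block is demoted to UNASSIGNED with a 1-based ref pointing back at the survivor and its size zeroed. A reduced sketch of that state transition on a hypothetical simplified block type (the real checks on unfoldable flags, memory types and p_refs are omitted):

#include <stdio.h>

/* Hypothetical simplified block: just the life-time endpoints and the
 * fold bookkeeping the real code updates. */
typedef struct {
	int head, tail;  /* single head/tail exec index (the fold precondition) */
	int unassigned;  /* set on the block that got folded away */
	int ref;         /* 1-based index of the surviving block */
	size_t size;
} block_t;

/* Fold the output block into the input block if they meet at one exec. */
static int try_fold(block_t* const blocks, const int in, const int out)
{
	if (blocks[in].tail != blocks[out].head)
		return 0; /* they do not meet at the same exec, cannot fold */
	blocks[in].tail = blocks[out].tail; /* input adopts the output's tail */
	blocks[out].unassigned = 1;
	blocks[out].ref = in + 1; /* 1-based back reference */
	blocks[out].size = 0;     /* the folded block no longer needs memory */
	return 1;
}

int main(void)
{
	block_t blocks[2] = { { 0, 2, 0, 0, 64 }, { 2, 5, 0, 0, 64 } };
	if (try_fold(blocks, 0, 1))
		printf("block 0 now lives over execs %d..%d, block 1 refs %d\n",
			blocks[0].head, blocks[0].tail, blocks[1].ref - 1);
	return 0;
}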
2068
2069static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2070{
2071 int i, j, k;
2072 // Generate exec dependencies (or, in other words, partial ordering of executions).
2073 ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2074 int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2075 int buf_size;
2076 if (p_node_info)
2077 { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2077, __extension__ __PRETTY_FUNCTION__
); }))
; }
2078#define for_block(x, val) \
2079 do { \
2080 if (((int32_t*)val)[0] > 0) \
2081 { \
2082 buf[buf_size * 2] = x; \
2083 buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2084 ++buf_size; \
2085 } \
2086 } while (0)
2087 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx;
{
2088 buf_size = 0; /* save all its parent deps to this buffer */
2089 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2090 if (vector)
2091 CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
2092 if (!node->outgoings)
2093 continue;
2094 for (i = 0; i < node->outgoings->rnum; i++)
2095 {
2096 int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2097 const int32_t one = 1;
2098 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2099 /* If not found, set it. If the current node is the destination node, there is no need
2100 * to set itself as a parent of subsequent nodes because of its terminal nature. */
2101 if (!cell.i32 || cell.i32[0] == 0)
2102 ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2103 if (buf_size > 0)
2104 {
2105 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2106 assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2106, __extension__ __PRETTY_FUNCTION__); }))
;
2107 for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2108 {
2109 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2110 /* If not found, set */
2111 if (!cell.i32 || cell.i32[0] == 0)
2112 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2113 else {
2114 /* Otherwise, set to the longest one */
2115 int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
+ 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; })
;
2116 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2117 }
2118 }
2119 }
2120 }
2121 } ccv_nnc_graph_visit_endfor} }
2122#undef for_block
2123 ccfreefree(buf);
2124 // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2125 const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2126 ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2127 // The reason is that I need to mark every one of them as unassigned unless it is used somewhere. It
2128 // happens that I have to loop through all relevant nodes to find out whether one is used or not.
2129 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2130 tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2131 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2132 for (i = 0; i < node->input_size; i++)
2133 if (node->inputs[i] >= 0)
2134 {
2135 tensor_blocks[node->inputs[i]].flags = 0;
2136 // If this is a data transfer node and this is CPU memory, mark the memory type as pinned memory.
2137 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2138 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2139 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2140 tensor_blocks[node->inputs[i]].pin_mem = 1;
2141 }
2142 for (i = 0; i < node->output_size; i++)
2143 if (node->outputs[i] >= 0)
2144 {
2145 tensor_blocks[node->outputs[i]].flags = 0;
2146 // If this is a data transfer node and this is CPU memory, mark the memory type as pinned memory.
2147 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2148 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2149 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2150 tensor_blocks[node->outputs[i]].pin_mem = 1;
2151 }
2152 } ccv_nnc_graph_visit_endfor} }
2153 if (p_node_info)
2154 {
2155 assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2155, __extension__ __PRETTY_FUNCTION__
); }))
;
2156 // Mark it as used if it is used in either input or output.
2157 for (i = 0; i < p_node_info->input_size; i++)
2158 if (p_node_info->inputs[i] >= 0)
2159 {
2160 const int d = p_node_info->inputs[i];
2161 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2162 {
2163 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2164 if (dd >= 0) // If this exists in this sub-graph, great.
2165 tensor_blocks[dd].flags = 0;
2166 }
2167 }
2168 for (i = 0; i < p_node_info->output_size; i++)
2169 if (p_node_info->outputs[i] >= 0)
2170 {
2171 const int d = p_node_info->outputs[i];
2172 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2173 {
2174 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2175 if (dd >= 0) // If this exists in this sub-graph, great.
2176 tensor_blocks[dd].flags = 0;
2177 }
2178 }
2179 }
2180 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2181 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2182 {
2183 // Check no tensor info is auto now.
2184 assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2184, __extension__ __PRETTY_FUNCTION__
); }))
;
2185 // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as a parameter,
2186 // therefore, its life-cycle almost certainly won't concatenate properly with the tensor it would
2187 // fold into).
2188 if (tensor_symbol_info[i].assign_ref)
2189 {
2190 // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2191 // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input,
2192 // it keeps its own representation, which is not the case for output).
2193 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2194 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2195 // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2196 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2197 // It also cannot be folded as output (except i), because we need to keep its own representation.
2198 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2199 assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
== 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2199, __extension__ __PRETTY_FUNCTION__
); }))
;
2200 tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2201 for (j = 0; j < unroll_count; j++)
2202 {
2203 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT))
;
2204 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2205 }
2206 if (tensor_blocks[assign_ref].bypass_ref)
2207 {
2208 // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2209 tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2210 const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2211 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2212 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2213 // On the other hand, it can be folded into the except_ref for the bypass_ref.
2214 tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2215 if (dup_tensor_from_ref)
2216 {
2217 const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2218 if (bypass_from_ref >= 0)
2219 {
2220 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT))
;
2221 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT))
;
2222 assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
+ unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
- 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2222, __extension__ __PRETTY_FUNCTION__
); }))
;
2223 for (j = 0; j < unroll_count - 1; j++)
2224 {
2225 // Mark every incarnation as un-foldable.
2226 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_INPUT))
;
2227 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT))
;
2228 }
2229 }
2230 }
2231 }
2232 }
2233 }
2234 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2235 {
2236 // If it has a pair reference, we don't need to allocate this tensor at all,
2237 // set it to be unassigned.
2238 if (tensor_symbol_info[i].pair_ref)
2239 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED))
;
2240 // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use the tape properly).
2241 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2242 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2243 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2244 // For this case, there is no exception.
2245 tensor_blocks[i].unfoldable_except_ref = 0;
2246 } else if (tensor_symbol_info[i].p_ref) {
2247 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2247, __extension__ __PRETTY_FUNCTION__); }))
;
2248 const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2249 // If this is a case..of graph, and this tensor is an input from the parent graph, it cannot be folded as input.
2250 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2251 // TODO: This check can be lifted if we can fold in the parent graph.
2252 if (-1 == p_ref_is_in_or_out)
2253 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2254 if (1 == p_ref_is_in_or_out) // If p_ref is an output, it cannot be folded as input.
2255 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2256 }
2257 }
2258 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2259 {
2260 if (tensor_symbol_info[i].alias_ref)
2261 {
2262 const int ref = tensor_symbol_info[i].alias_ref - 1;
2263 // If the referenced one is unassigned, mark it as assigned only if the current one is assigned.
2264 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2265 tensor_blocks[ref].flags = 0;
2266 // An alias cannot refer to another alias.
2267 assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
__assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2267, __extension__ __PRETTY_FUNCTION__); }))
;
2268 tensor_blocks[i].flags = ALIAS;
2269 tensor_blocks[i].ref = ref + 1; // Assign the ref.
2270 if (!tensor_blocks[ref].r_refs)
2271 tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2272 ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2273 }
2274 }
2275 // Scan again, and if the ref is not assigned, mark the alias as not assigned.
2276 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2277 if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2278 {
2279 const int ref = tensor_blocks[i].ref - 1;
2280 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2281 {
2282 // Mark this as unassigned.
2283 tensor_blocks[i].flags = UNASSIGNED;
2284 tensor_blocks[i].ref = 0;
2285 }
2286 }
2287 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2288 {
2289 // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2290 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
2291 {
2292 tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2293 tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2294 // Cache tensor size (align to 16 bytes).
2295 tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2296 }
2297 // If there is a p_ref, add it to the p_refs list.
2298 if (tensor_symbol_info[i].p_ref)
2299 tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2300 }
2301 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2302 for (i = 0; i < node->input_size; i++)
2303 {
2304 int d = node->inputs[i];
2305 if (d < 0)
2306 continue;
2307 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2308 d = tensor_symbol_info[d].alias_ref - 1;
2309 tensor_blocks[d].flags |= READ_ONLY;
2310 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2311 continue;
2312 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2312, __extension__ __PRETTY_FUNCTION__
); }))
;
2313 /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph,
2314 * from the very beginning of the graph life-cycle, and ends here). */
2315 if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
2316 {
2317 for (j = 0; j < source_size; j++)
2318 {
2319 // If the source is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2320 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2321 if (cell.i32 && cell.i32[0] > 0)
2322 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2323 }
2324 /* If this is read-only (based on SSA, i.e. first encountered as a read), and this is a
2325 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2326 * loop; however, in that case, you need to prevent the read-only block from getting reused for the
2327 * output tensor, which is not obvious how to implement correctly), and it is not
2328 * assign_ref'd from anywhere (not a parameterized loop), then we cannot reuse this region
2329 * of memory anyway (because on the second loop, we want to read the same value out).
2330 * Mark its life-time to the end of the graph. */
2331 if (p_node_info && !tensor_symbol_info[d].assign_ref)
2332 for (j = 0; j < destination_size; j++)
2333 {
2334 // If the destination is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2335 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2336 if (cell.i32 && cell.i32[0] > 0)
2337 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2338 }
2339 }
2340 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2341 }
2342 for (i = 0; i < node->output_size; i++)
2343 {
2344 int d = node->outputs[i];
2345 if (d < 0)
2346 continue;
2347 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2348 d = tensor_symbol_info[d].alias_ref - 1;
2349 tensor_blocks[d].flags |= WRITE_ONLY;
2350 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2351 continue;
2352 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2352, __extension__ __PRETTY_FUNCTION__
); }))
;
2353 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2354 }
2355 } ccv_nnc_graph_visit_endfor} }
2356 // For any assign_ref, its life-time is kept until the end so it can wrap over.
2357 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2358 // If this tensor is not unassigned (or an alias) and it is assigned from somewhere else,
2359 // that "somewhere else" needs to keep its life-time until the end.
2360 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&&
2361 p_node_info && tensor_symbol_info[i].assign_ref)
2362 {
2363 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2364 for (j = 0; j < destination_size; j++)
2365 {
2366 // This logic is to be more conservative about which destination we add to.
2367 // As of now, if we add everything, it is fine most likely. However, it may
2368 // cause issues in the future to do so naively. Thus, instead, we only add
2369 // the destination to it iff either the tensor is not used at all, or the
2370 // destination is on the same stream as the tensor block in some way.
2371 int flag = !tensor_blocks[assign_ref].tail;
2372 for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2373 {
2374 const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
+ (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)))
;
2375 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2376 flag = (cell.i32 && cell.i32[0] > 0);
2377 }
2378 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2379 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2380 }
2381 }
2382 for (i = 0; i < output_size; i++)
2383 {
2384 assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
__assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2384, __extension__ __PRETTY_FUNCTION__); }))
;
2385 int d = outputs[i].d;
2386 if (d < 0)
2387 continue;
2388 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2389 d = tensor_symbol_info[d].alias_ref - 1;
2390 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2391 continue;
2392 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2392, __extension__ __PRETTY_FUNCTION__
); }))
;
2393 for (j = 0; j < destination_size; j++)
2394 {
2395 int flag = !tensor_blocks[d].tail;
2396 for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2397 {
2398 const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
2399 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2400 flag = (cell.i32 && cell.i32[0] > 0);
2401 }
2402 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2403 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2404 }
2405 }
2406 // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2407 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2408 int x, y;
2409 for (x = 0; x < node->input_size; x++)
2410 for (y = 0; y < node->output_size; y++)
2411 /* Some operations enforces some tensors to be the same for inputs / outputs. */
2412 if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2413 {
2414 // If both unassigned, it is fine.
2415 if (node->inputs[x] < 0 && node->outputs[y] < 0)
2416 continue;
2417 int ref = node->inputs[x];
2418 assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2418, __extension__ __PRETTY_FUNCTION__); }))
;
2419 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2420 ref = tensor_blocks[ref].ref - 1;
2421 const int node_output_y = node->outputs[y];
2422 assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2422, __extension__ __PRETTY_FUNCTION__
); }))
;
2423 // If neither is computable, it is fine; we don't need to enforce.
2424 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2425 !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
!((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
))
)
2426 continue;
2427 // Otherwise, enforce and error out if failed.
2428 if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2429 { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2429, __extension__ __PRETTY_FUNCTION__
); }))
; }
2430 }
2431 } ccv_nnc_graph_visit_endfor} }
2432 // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2433 // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2434 // that is not enforced in-place (because the tensor enforced in-place will be different from the
2435 // binding one).
2436 for (i = 0; i < tensor_bind_size; i++)
2437 {
2438 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2439 // If there is a tensor bound, then it is unassigned.
2440 if (resolved_symbol.d >= 0)
2441 {
2442 int d = resolved_symbol.d;
2443 // I cannot assert too much at this moment.
2444 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2445 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2446 // This check is for in-place ops. Only an in-place op could be unassigned but still carry a ref.
2447 // It has nothing to do with alias.
2448 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2449 d = tensor_blocks[d].ref - 1;
2450 // Doesn't work if this is a loop-carried variable.
2451 assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
__extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
__assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2451, __extension__ __PRETTY_FUNCTION__); }))
;
2452 tensor_blocks[d].flags = UNASSIGNED;
2453 tensor_blocks[d].ref = 0; // No need to have ref as well.
2454 }
2455 }
2456 // Maximize tensor reuse by collapsing tensors where in-place operations are allowed (and the start/end tensors match).
2457 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2458 int x, y;
2459 for (x = 0; x < node->input_size; x++)
2460 {
2461 /* If the input is not assigned, it may reference another tensor; find the referenced one */
2462 int ref = node->inputs[x];
2463 if (ref < 0)
2464 continue;
2465 const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2466 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2467 ref = tensor_blocks[ref].ref - 1;
2468 assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2468, __extension__ __PRETTY_FUNCTION__
); }))
;
2469 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2470 tensor_blocks[ref].tail->rnum == 1)
2471 {
2472 for (y = 0; y < node->output_size; y++)
2473 /* Only proceed if the input symbol is different from the output symbol, */
2474 /* and the input symbol meets the output symbol exactly at the same spot. */
2475 if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2476 node->outputs[y] >= 0 &&
2477 ref != node->outputs[y] &&
2478 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
0x3) == UNASSIGNED))
)
2479 {
2480 const int node_output_y = node->outputs[y];
2481 const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2482 /* If the dimensions match perfectly, then we can assign y_symbol to x.
2483 * If both of them are aliases, make sure their origins match in size too. */
2484 if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2485 {
2486 _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2487 // This refers to an alias itself; mark it now and it will be processed later.
2488 if (ref != node->inputs[x])
2489 tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2490 }
2491 }
2492 }
2493 }
2494 } ccv_nnc_graph_visit_endfor} }
2495 // Specifically handle the bypass. This needs to be done after the first pass.
2496 // I need to extend the bypass life-time to match the one it goes with.
2497 // It is important that we visit these nodes and assign bypass_ref to their dependents in topological order.
2498 ccv_nnc_tensor_block_t empty_block = {};
2499 empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2500 empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2501 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2502 if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2503 {
2504 int can_bypass = 1;
2505 for (i = 0; can_bypass && i < node->output_size; i++)
2506 {
2507 int d = node->outputs[i];
2508 if (d < 0)
2509 continue;
2510 if (!tensor_blocks[d].bypass_ref)
2511 continue;
2512 while (tensor_blocks[d].ref)
2513 d = tensor_blocks[d].ref - 1;
2514 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2515 while (tensor_blocks[bypass_ref].ref)
2516 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2517 // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2518 if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2519 continue;
2520 ccv_array_clear(empty_block.head);
2521 for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2522 ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
+ (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j)))
);
2523 ccv_array_clear(empty_block.tail);
2524 for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2525 ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
+ (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j)))
);
2526 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2527 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, empty_block);
2528 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2529 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, empty_block);
2530 // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2531 assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2531, __extension__ __PRETTY_FUNCTION__
); }))
;
2532 int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2533 while (tensor_blocks[b_ref].ref)
2534 b_ref = tensor_blocks[b_ref].ref - 1;
2535 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2536 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2537 // These two can be assigned to the same region of memory without issue (because their life-times don't interfere)
2538 // even after we extend the life-time of bypass_ref. Then we are in good shape.
2539 can_bypass = can_bypass && (a_hop_b || b_hop_a);
2540 }
2541 if (can_bypass)
2542 {
2543 for (i = 0; i < node->output_size; i++)
2544 {
2545 int d = node->outputs[i];
2546 if (d < 0)
2547 continue;
2548 if (!tensor_blocks[d].bypass_ref)
2549 continue;
2550 while (tensor_blocks[d].ref)
2551 d = tensor_blocks[d].ref - 1;
2552 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2553 while (tensor_blocks[bypass_ref].ref)
2554 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2555 // The bypass_ref can extend its life-time.
2556 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2557 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2558 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2559 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2560 }
2561 } else {
2562 for (i = 0; i < node->output_size; i++)
2563 tensor_blocks[node->outputs[i]].bypass_ref = 0;
2564 const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2565 // Mark this exec as no bypass IO (thus, I need to insert explicit data transfer).
2566 exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2567 }
2568 }
2569 } ccv_nnc_graph_visit_endfor} }
2570 ccv_array_free(empty_block.head);
2571 ccv_array_free(empty_block.tail);
2572 *r_exec_dep = exec_dep;
2573 *r_tensor_blocks = tensor_blocks;
2574}
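/* Editor's sketch (not from the original file): the a_hop_b / b_hop_a checks used
 * above decide whether two tensor blocks may share one memory region. Reduced to a
 * hypothetical live_range_t with scalar head/tail positions, the test is a plain
 * interval-disjointness check; the real code walks exec_dep rather than comparing
 * scalar positions. */
typedef struct {
	int head; // first exec node that touches the block
	int tail; // last exec node that touches the block
} live_range_t;

// Two blocks can be folded into the same region iff one's head comes after the
// other's tail, i.e. their live ranges never overlap.
static int can_share_memory(const live_range_t a, const live_range_t b)
{
	const int a_hop_b = a.head > b.tail;
	const int b_hop_a = b.head > a.tail;
	return a_hop_b || b_hop_a;
}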
2575
2576static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2577{
2578 if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2579 {
2580 ccv_nnc_cmd_t retval = cmd;
2581 retval.cmd = CCV_NNC_NOOP;
2582 return retval;
2583 }
2584 return cmd;
2585}
2586
2587static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2588{
2589 if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2590 {
2591 if (tensor_symbol_info[input].alias_ref)
2592 {
2593 const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2594 assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2594, __extension__ __PRETTY_FUNCTION__
); }))
;
2595 ccv_nnc_tensor_symbol_t tensor_symbol = {};
2596 if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2597 {
2598 tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2599 if (tensor_symbol_info[alias_ref].pair_ref)
2600 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2601 .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2602 .graph = dup_graph->pair
2603 });
2604 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2605 dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2606 } else {
2607 tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2608 tensor_symbol.graph = dup_graph;
2609 }
2610 ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2611 if (tensor_symbol_info[input].pair_ref)
2612 ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2613 .d = tensor_symbol_info[input].pair_ref - 1,
2614 .graph = dup_graph->pair
2615 });
2616 ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2617 dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2618 } else {
2619 ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2620 if (tensor_symbol_info[input].pair_ref)
2621 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2622 .d = tensor_symbol_info[input].pair_ref - 1,
2623 .graph = dup_graph->pair
2624 });
2625 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2626 dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2627 }
2628 if (tensor_symbol_info[input].bypass_ref)
2629 {
2630 const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2631 assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2631, __extension__ __PRETTY_FUNCTION__
); }))
;
2632 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])))
;
2633 symbol_info->bypass_ref = dup_bypass_ref + 1;
2634 }
2635 }
2636 return (ccv_nnc_tensor_symbol_t) {
2637 .d = dup_tensor_block_ref[input * unroll_count],
2638 .graph = dup_graph,
2639 };
2640}
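/* Editor's sketch (not from the original file): _ccv_nnc_dup_tensor_symbol above is a
 * memoized constructor. The dup_ref slot doubles as the cache, with -1 meaning "not
 * duplicated yet"; make_copy is a hypothetical stand-in for ccv_nnc_tensor_symbol_new
 * plus the alias / pair handling. */
static int dup_once(int* const dup_ref, const int input, int (*const make_copy)(const int))
{
	if (dup_ref[input] < 0) // Not duplicated yet, create the copy and cache its index.
		dup_ref[input] = make_copy(input);
	return dup_ref[input]; // Subsequent calls reuse the cached copy.
}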
2641
2642static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2643{
2644 int i;
2645 if (dup_exec_ref[idx * unroll_count] < 0)
2646 {
2647 // Input has to come before output, because output could have a bypass reference to the input.
2648 for (i = 0; i < node->input_size; i++)
2649 max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2650 for (i = 0; i < node->output_size; i++)
2651 max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2652 ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2653 dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2654 }
2655 return (ccv_nnc_graph_exec_symbol_t) {
2656 .d = dup_exec_ref[idx * unroll_count],
2657 .graph = dup_graph,
2658 };
2659}
2660
2661static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2662{
2663 int i;
2664 for (i = 0; i < tensor_block_size; i++)
2665 {
2666 if (tensor_blocks[i].head)
2667 ccv_array_free(tensor_blocks[i].head);
2668 if (tensor_blocks[i].tail)
2669 ccv_array_free(tensor_blocks[i].tail);
2670 if (tensor_blocks[i].r_refs)
2671 ccv_array_free(tensor_blocks[i].r_refs);
2672 if (tensor_blocks[i].dup_p_refs)
2673 ccv_array_free(tensor_blocks[i].dup_p_refs);
2674 }
2675 ccfreefree(tensor_blocks);
2676}
2677
2678// Find tensors that cannot be solved by co-allocating to the same location.
2679static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2680{
2681 int i, j, unroll_count = 0;
2682 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2683 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2684 {
2685 // This is a parameter, thus, it has to be either an alias or used.
2686 assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
& 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2686, __extension__ __PRETTY_FUNCTION__
); }))
;
2687 const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2688 // The parameter it assigns to has to be either an alias or used.
2689 assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2689, __extension__ __PRETTY_FUNCTION__
); }))
;
2690 // If either of these two (assigner and assignee) is an alias, check to see if they are the same.
2691 // If they are the same, we are good, no need to extend.
2692 int a_ref = i;
2693 while (tensor_blocks[a_ref].ref)
2694 a_ref = tensor_blocks[a_ref].ref - 1;
2695 int b_ref = assign_ref;
2696 while (tensor_blocks[b_ref].ref)
2697 b_ref = tensor_blocks[b_ref].ref - 1;
2698 if (a_ref != b_ref)
2699 {
2700 // If any of b's heads is deterministically later than a's tail,
2701 // or any of b's tails is deterministically earlier than a's head, they don't interfere.
2702 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2703 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2704 // It cannot be that both i can hop to j and j can hop to i.
2705 assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
> 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2705, __extension__ __PRETTY_FUNCTION__
); }))
;
2706 // Can it be folded?
2707 // These two can be assigned to the same region of memory without issue (because their life-times don't interfere).
2708 if (a_hop_b || b_hop_a)
2709 {
2710 tensor_blocks[a_ref].companion_ref = b_ref + 1;
2711 tensor_blocks[b_ref].companion_ref = a_ref + 1;
2712 continue;
2713 }
2714 int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2715 for (j = 0; c_ref >= 0; j++)
2716 {
2717 while (tensor_blocks[c_ref].ref)
2718 c_ref = tensor_blocks[c_ref].ref - 1;
2719 c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2720 }
2721 unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
= (j + 1); (_a > _b) ? _a : _b; })
;
2722 }
2723 }
2724 // Reset companion_ref if need to unroll.
2725 if (unroll_count)
2726 for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2727 tensor_blocks[j].companion_ref = 0;
2728 return unroll_count;
2729}
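/* Editor's sketch (not from the original file): the j-loop above derives the unroll
 * count from the length of the assign_ref chain. With a hypothetical flat array of
 * 1-based assign_ref links (0 meaning none, chain assumed acyclic), the count is a
 * simple chain walk; the real code also resolves ref indirections and checks
 * live-range interference before counting. */
static int assign_ref_chain_length(const int* const assign_ref, const int start)
{
	int count = 0;
	int c = assign_ref[start] - 1; // Convert the 1-based link to an index, -1 terminates.
	while (c >= 0)
	{
		++count;
		c = assign_ref[c] - 1;
	}
	return count;
}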
2730
2731static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2732{
2733 int i, j, n;
2734 // The inout exec nodes, these are the nodes we are going to extend.
2735 uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2736 int max_input_size = 0;
2737 int max_output_size = 0;
2738 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2739 {
2740 max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
;
2741 max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
;
2742 }
2743 ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
2744 ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
2745 // Doing graph expansion
2746 // It goes without saying, we must have more than one tensor / exec (otherwise I cannot use 0 as no exec ref).
2747 assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2747, __extension__ __PRETTY_FUNCTION__
); }))
;
2748 assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2748, __extension__ __PRETTY_FUNCTION__
); }))
;
2749#define INCOMING_NODE (1)
2750#define OUTGOING_NODE (2)
2751 // Unroll the graph n times.
2752 for (n = 0; n < unroll_count; n++)
2753 {
2754 int* const dup_exec_ref = r_dup_exec_ref + n;
2755 const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2756 int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2757 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2758 dup_exec_ref[i * unroll_count] = -1;
2759 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2760 {
2761 // If there is an assign_ref, that means I don't need to dup the tensor.
2762 if (tensor_symbol_info[i].assign_ref)
2763 {
2764 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2765 dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2766 } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&& TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2767 // If this is a read-only tensor block, no need to duplicate because the value never changes
2768 // (note we handled assign_ref first), therefore, no need to generate a duplicate.
2769 dup_tensor_block_ref[i * unroll_count] = i;
2770 else
2771 dup_tensor_block_ref[i * unroll_count] = -1;
2772 }
2773 // Go through the original graph, make copies of the node if it is inout.
2774 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2775 ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2776 inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2777 if (!node->outgoings)
2778 continue;
2779 for (i = 0; i < node->outgoings->rnum; i++)
2780 {
2781 const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2782 inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2783 ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2784 ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2785 }
2786 } ccv_nnc_graph_visit_endfor} }
2787 // Check that the visited nodes are all marked as either incoming or outgoing.
2788 const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2789 const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2790 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2791 {
2792 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2793 continue;
2794 assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2794, __extension__ __PRETTY_FUNCTION__
); }))
;
2795 // If this is a pure incoming node, then I need to concat this one with all original destination nodes
2796 if (inout[i] == INCOMING_NODE)
2797 for (j = 0; j < dup_destination_size; j++)
2798 {
2799 ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2800 .d = dup_destinations[j].d,
2801 .graph = dup_graph,
2802 }, (ccv_nnc_graph_exec_symbol_t) {
2803 .d = dup_exec_ref[i * unroll_count],
2804 .graph = dup_graph,
2805 });
2806 }
2807 }
2808 if (dup_graph->destinations)
2809 ccv_array_clear(dup_graph->destinations);
2810 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2811 {
2812 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2813 continue;
2814 const int d = dup_exec_ref[i * unroll_count];
2815 ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
+ (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)))
;
2816 // If this has no outgoing node, add to the destination.
2817 if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2818 ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2819 .graph = dup_graph,
2820 .d = d,
2821 });
2822 }
2823 }
2824#undef INCOMING_NODE
2825#undef OUTGOING_NODE
2826 ccfreefree(inout);
2827}
2828
2829static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2830{
2831 int i;
2832 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2833 // Now we can assign them (the dup) as companions.
2834 // Get to the last one, which we will wrap over.
2835 if (dup_tensor_symbol_info[i].assign_ref)
2836 {
2837 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2838 dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2839 assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2839, __extension__ __PRETTY_FUNCTION__
); }))
;
2840 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2841 }
2842}
2843
2844// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2845// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2846// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2847static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2848{
2849 int i, j, k;
2850 for (i = 0; i < p_node_info->output_size; i++)
2851 {
2852 const int d = p_node_info->outputs[i];
2853 const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx)))
- 1;
2854 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
!((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED))
)
2855 continue;
2856 for (k = 0; k < destination_size; k++)
2857 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2858 // Add the duplicated destinations to the tensor_block_ref.
2859 for (j = 0; j < unroll_count; j++)
2860 for (k = 0; k < destination_size; k++)
2861 {
2862 const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2863 const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2864 if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2865 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2866 }
2867 }
2868}
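/* Editor's sketch (not from the original file): "extending the life-time to the end of
 * the graph" above amounts to folding every destination exec into the block's live
 * range. This reuses the hypothetical scalar head/tail view and assumes exec indices
 * follow topological order; the real code records the destinations in the block's tail
 * array and relies on exec_dep instead of index comparison. */
static void extend_tail_to_destinations(int* const block_tail, const int* const destinations, const int destination_size)
{
	int i;
	for (i = 0; i < destination_size; i++)
		if (destinations[i] > *block_tail)
			*block_tail = destinations[i]; // The block now stays alive through this destination.
}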
2869
2870static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2871{
2872 int i, j;
2873 ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2874 ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2875 // Blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2876 // Unfortunately, I cannot apply this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2877 // No need to change anything, we are good.
2878 const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2879 if (!unroll_count)
2880 return;
2881 // Have conditions that cannot be satisfied with a simple solution (allocate to the same memory region).
2882 // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2883 ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2884 int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2885 int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2886 _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2887 ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2888 ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2889 ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
(dup_graph->exec_symbol_info->rnum) + _incoming_edges_ >
1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
= (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = {
(dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1
; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for
(_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
_idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue
; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r =
1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) {
((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph
->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1
; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_
[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((
_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum
)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->destinations->rnum); _exist_size_[1] =
0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
= 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
6 && _d_ < (dup_graph->destinations->rnum))
{ _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
< ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 6 && _d_ <
(dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0
), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->
rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
<= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2889, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
2890 ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2891 _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2892 // Free out the old exec_dep
2893 ccv_matrix_free(exec_dep);
2894 // and the tensor blocks, prepare for the new.
2895 _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2896 // A reverse map to find where the original tensor comes from.
2897 int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2898 for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2899 dup_tensor_from_ref[i] = -1;
2900 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2901 for (j = 0; j < unroll_count; j++)
2902 if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2903 dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2904 int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2905 for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2906 dup_exec_from_ref[i] = -1;
2907 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2908 {
2909 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2910 continue;
2911 dup_exec_from_ref[i] = i; // Reference back.
2912 for (j = 0; j < unroll_count; j++)
2913 if (dup_exec_ref[i * unroll_count + j] >= 0)
2914 dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2915 }
2916 // Reset all attr.
2917 memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2918 _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2919 ccv_nnc_graph_visit_free(dup_visit);
2920 ccfreefree(dup_exec_symbol_info);
2921 ccfreefree(dup_exec_from_ref);
2922 ccfreefree(dup_tensor_from_ref);
2923 // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2924 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2925 // Loop over all possible duplications to assign dup_p_ref properly.
2926 for (j = 0; j < unroll_count; j++)
2927 {
2928 const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2929 if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2930 {
2931 const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2932 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2933 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2934 {
2935 if (!tensor_blocks[dup_idx].dup_p_refs)
2936 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2937 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2938 }
2939 if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2940 continue;
2941 const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2942 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2943 if (p_ref_1_is_in_or_out == 1)
2944 {
2945 if (!tensor_blocks[dup_idx].dup_p_refs)
2946 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2947 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2948 }
2949 }
2950 }
2951 // companion_ref
2952 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2953 // Now we can assign them (the dup) as companions.
2954 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2955 {
2956 // Get to the last one, which we will wrap over.
2957 const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2958 if (assign_ref >= 0)
2959 {
2960 int b_ref = assign_ref;
2961 while (tensor_blocks[b_ref].ref)
2962 b_ref = tensor_blocks[b_ref].ref - 1;
2963 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2964 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2965 // It cannot be that both i can hop to j and j can hop to i.
2966 // And it must be possible to hop from one to the other now after duplication.
2967 assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2967, __extension__ __PRETTY_FUNCTION__); }))
;
2968 tensor_blocks[i].companion_ref = b_ref + 1;
2969 tensor_blocks[b_ref].companion_ref = i + 1;
2970 }
2971 }
2972 ccfreefree(dup_tensor_symbol_info);
2973 // Extend the dup tensor block ref, prepare for future extensions.
2974 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2975 for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2976 dup_tensor_block_ref[i] = -1;
2977 // Assign out changed properties.
2978 *r_exec_dep = exec_dep;
2979 *r_tensor_blocks = tensor_blocks;
2980 *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2981 *r_dup_graph = dup_graph;
2982 *r_unroll_count = unroll_count;
2983 *r_dup_exec_ref = dup_exec_ref;
2984 *r_dup_tensor_block_ref = dup_tensor_block_ref;
2985}
2986
2987static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2988{
2989 if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2990 return tensor_block_size;
2991 int i;
2992 const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2993 int found_idx = tensor_block_size;
2994 for (i = 0; i < anonymous_block_free_list_cap; i++)
2995 {
2996 const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)))
;
2997 assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 2997, __extension__ __PRETTY_FUNCTION__
); }))
;
2998 // If the type doesn't match, ignore.
2999 if (tensor_blocks[idx].type != type)
3000 continue;
3001 // Heuristic about how to select the best tensor block to move forward.
3002 // If the size is larger and no dup_p_refs are given, I cannot do better than this, just return directly.
3003 if (tensor_blocks[idx].size >= size)
3004 {
3005 if (no_dup_p_refs)
3006 return idx;
3007 // Otherwise, only if the current tensor block's dup_p_refs is after (or at) the given dup_p_refs
3008 // can we not do better than this; if that is the case, just return.
3009 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
3010 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
3011 return idx;
3012 }
3013 int64_t found_idx_size_diff;
3014 int64_t idx_size_diff;
3015 if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
3016 // Now, compare whether this one or the found_idx one is better.
3017 // At this point, there is no point in comparing the dup_p_refs, we only care about which one
3018 // is closer to the size we request. Only on a tie does dup_p_refs become important again.
3019 (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3020 {
3021 found_idx = idx;
3022 continue;
3023 }
3024 // No need to update if found_idx is better than idx.
3025 if (found_idx_size_diff > idx_size_diff)
3026 continue;
3027 // We bias towards the bigger one in case of a tie.
3028 if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3029 {
3030 found_idx = idx;
3031 continue;
3032 }
3033 assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
== tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 3033, __extension__ __PRETTY_FUNCTION__
); }))
;
3034 // On a tie, check which one has the tighter life-cycle.
3035 if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3036 {
3037 // Check whether the current tensor block's life-cycle is longer than the previous one.
3038 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3039 (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3040 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3041 found_idx = idx;
3042 continue;
3043 }
3044 // Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
3045 // We prefer to choose the one that has a life-cycle closer to the expected ones.
3046 if (no_dup_p_refs)
3047 {
3048 // Whoever is shorter wins.
3049 if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3050 (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3051 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3052 found_idx = idx;
3053 continue;
3054 }
3055 if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3056 continue;
3057 if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3058 {
3059 found_idx = idx;
3060 continue;
3061 }
3062 // If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3063 const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3064 const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3065 if (idx_after_request && found_idx_after_request)
3066 {
3067 if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3068 found_idx = idx;
3069 continue;
3070 } else {
3071 // If we entered this branch, either idx_after_request is false or found_idx_after_request is false, or both.
3072 // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3073 // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3074 if (!found_idx_after_request && (idx_after_request ||
3075 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3076 found_idx = idx;
3077 continue;
3078 }
3079 }
3080 return found_idx;
3081}
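/* Editor's sketch (not from the original file): stripped of the dup_p_refs tie-breakers,
 * the free-list scan above is "closest size wins, bigger on an exact tie". The plain
 * candidate_sizes array is hypothetical; the real code walks anonymous_block_free_list
 * and also filters on type and life-cycle. */
#include <stdint.h>
#include <stdlib.h>

static int closest_size_index(const uint64_t* const candidate_sizes, const int count, const uint64_t size)
{
	int found = -1;
	int64_t found_diff = 0;
	int i;
	for (i = 0; i < count; i++)
	{
		const int64_t diff = llabs((int64_t)candidate_sizes[i] - (int64_t)size);
		if (found < 0 || diff < found_diff ||
			(diff == found_diff && candidate_sizes[i] > candidate_sizes[found]))
		{
			found = i;
			found_diff = diff;
		}
	}
	return found; // -1 when there are no candidates.
}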
3082
3083static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3084{
3085 if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3086 return 0;
3087 int i, j, k;
3088 int input_size = 0;
3089 for (i = 0; i < p_node_info->p_while.input_size; i++)
3090 if (p_node_info->p_while.inputs[i] >= 0)
3091 ++input_size;
3092 // If it doesn't have tensor inputs (thus, only special inputs), just return.
3093 if (!input_size)
3094 return 0;
3095 ccv_nnc_tensor_symbol_t inputs[input_size];
3096 input_size = 0;
3097 for (i = 0; i < p_node_info->p_while.input_size; i++)
3098 if (p_node_info->p_while.inputs[i] >= 0)
3099 inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3100 .d = p_node_info->p_while.inputs[i],
3101 .graph = symbolic_graph,
3102 };
3103 assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
> 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3103, __extension__ __PRETTY_FUNCTION__
); }))
;
3104 ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
3105 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3106 for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3107 {
3108 // Make a noop copy of the breakpoint, but with some tensor inputs.
3109 ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
3110 ccv_array_push(dup_breakpoints, &noop);
3111 // Connect this noop to the outgoing nodes of breakpoints.
3112 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(symbolic_graph->breakpoints[i].d)))
;
3113 if (symbol_info->outgoings)
3114 for (j = 0; j < symbol_info->outgoings->rnum; j++)
3115 {
3116 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3117 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3118 .d = d,
3119 .graph = symbolic_graph,
3120 });
3121 }
3122 }
3123 for (i = 0; i < exec_symbol_info_size; i++)
3124 {
3125 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(i)))
;
3126 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
3127 continue;
3128 if (symbol_info->outgoings)
3129 {
3130 const int outgoing_size = symbol_info->outgoings->rnum;
3131 for (j = 0; j < outgoing_size; j++)
3132 {
3133 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3134 for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3135 if (d == symbolic_graph->breakpoints[k].d)
3136 {
3137       ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k);
3138 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3139 .d = i,
3140 .graph = symbolic_graph,
3141 }, noop);
3142 // Found, connected, exit.
3143 break;
3144 }
3145 }
3146 }
3147 }
3148 // Add the dup_breakpoints to source if necessary.
3149 assert(symbolic_graph->sources);
3150 const int source_size = symbolic_graph->sources->rnum;
3151 for (i = 0; i < source_size; i++)
3152 {
3153  const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
3154 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3155 if (d == symbolic_graph->breakpoints[j].d)
3156 {
3157     ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3158 ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3159 // Found, made, exit.
3160 break;
3161 }
3162 }
3163 // Add the dup_breakpoints to destination if necessary.
3164 assert(symbolic_graph->destinations);
3165 const int destination_size = symbolic_graph->destinations->rnum;
3166 for (i = 0; i < destination_size; i++)
3167 {
3168  const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
3169 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3170 if (d == symbolic_graph->breakpoints[j].d)
3171 {
3172     ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3173 ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3174 // Found, made, exit.
3175 break;
3176 }
3177 }
3178 return dup_breakpoints;
3179}
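
The helper above boils down to one pattern: create a CCV_NNC_NOOP exec symbol that merely carries the while loop's tensor inputs, then splice it in front of whatever each breakpoint fed into. A minimal, hypothetical sketch of that pattern, using only the calls visible above (the helper name and parameters here are illustrative, not part of the library):

static ccv_nnc_graph_exec_symbol_t example_noop_for_breakpoint(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t outgoing, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size)
{
	// A no-op node whose only job is to reference the loop's tensor inputs.
	ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), inputs, input_size, 0, 0, 0);
	// Wire the no-op to the node the original breakpoint fed into.
	ccv_nnc_graph_exec_symbol_concat(graph, noop, outgoing);
	return noop;
}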
3180
3181// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3182static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3183{
3184 assert(source_size > 0);
3185 assert(destination_size > 0);
3186 // First, fill all the "auto" holes.
3187 // This is the symbol table with the "auto" info filled up.
3188 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3189 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3190 ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3191 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3192 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3193 int i, j, k, p, q;
3194 const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3195 ccv_sparse_matrix_t* exec_dep;
3196 ccv_nnc_tensor_block_t* tensor_blocks;
3197 _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3198 int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3199 // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3200 // are automatically filled in, and all the sub-graphs are processed.
3201 // There is a last step though, for a while loop, it is parameterized:
3202 // while (x > 5) {
3203 // y = x + 1;
3204 // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3205 // we will do our best to avoid to do the actual data copy, what we do here is to check whether y can be x's alias.
3206 // If y can be x's alias, this is good, no other changes required. In above case, y can be x's alias because
3207 // it is an inplace operation.
3208 // But if y cannot be x's alias, for example, this while loop looks like this:
3209 // while (x > 5) {
3210 // y = x + a
3211 // b = x + y
3212 // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3213 // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3214 // has dependency on y as well).
3215 // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3216 // y = x + a -> b = x + y
3217 // This graph will be extended to look like this:
3218 // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3219 // while (x0 > 5) {
3220 // y0 = x0 + a0
3221 // b0 = x0 + y0
3222 // if (y0 > 5) break
3223 // y1 = y0 + b0
3224 // b1 = y0 + y1
3225 // } (y1 => x0, b1 => a0)
3226 // After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3227 // with each other now).
3228 // With this algorithm, we don't need to insert any data copy logic, the only thing need is to switch pointers
3229 // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
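 // Illustrative aside (hypothetical, not part of this file): the pointer switch described
 // above can be pictured as a two-buffer ring where the roles swap each iteration instead
 // of copying data back:
 //   float* y_buf[2] = { y0_storage, y1_storage };
 //   int parity = 0;
 //   while (keep_looping) {
 //     float* const y_prev = y_buf[parity];     // plays the role of y0
 //     float* const y_next = y_buf[parity ^ 1]; // plays the role of y1
 //     compute(y_next, y_prev);                 // y1 = f(y0); no copy back to x
 //     parity ^= 1;                             // swap roles for the next iteration
 //   }
 // tensor_multiview_t automates exactly this role swap for (y0, y1), (b0, b1), and so on.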
3230 ccv_nnc_symbolic_graph_t* dup_graph = 0;
3231 int* dup_exec_ref = 0;
3232 int* dup_tensor_block_ref = 0;
3233 int unroll_count = 0;
3234 // In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3235 ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3236 prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3237 prep->flags = 0;
3238 // Cannot handle dup a node that is a graph as well.
3239 if (p_exec_symbol_info)
3240 {
3241 prep->flags = p_node_info->flags;
3242 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3243 {
3244 _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3245   _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3246 } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3247 // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3248 }
3249 }
3250 ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3251 ccv_array_t* anonymous_block_free_list = 0;
3252 const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3253 // Record whether this tensor is folded in this round.
3254 uint32_t* const tensor_fold = (uint32_t*)ccmalloc(sizeof(uint32_t) * tensor_fold_size);
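 // Illustrative aside (hypothetical, not part of this file): tensor_fold is an ordinary
 // bitset packed into 32-bit words, so the fold bookkeeping below reduces to
 //   tensor_fold[i >> 5] |= (1u << (i & 0x1f));           // mark tensor block i as folded
 //   folded = tensor_fold[i >> 5] & (1u << (i & 0x1f));   // test whether block i was folded
 // with (tensor_block_size + 31) >> 5 words covering every tensor block.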
3255 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3256 for (p = 0; p < node->graph_ref_size; p++)
3257 {
3258   assert(symbolic_graph->sub_graphs);
3259   ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3260 ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3261   ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3262 sub_prep->dup_breakpoints = dup_breakpoints;
3263 sub_prep->p = prep;
3264   sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3265 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3266 const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3267 for (i = 0; i < s_alloc_prep->block_size; i++)
3268 {
3269 const int block_ref = s_alloc_prep->blocks[i].block_ref;
3270 const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3271 if (block_ref < sub_prep->tensor_symbol_info_size)
3272 {
3273 // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3274 // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3275 if (s_tensor_blocks[block_ref].bypass_ref)
3276 {
3277 int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3278 while (s_tensor_blocks[bypass_ref].ref)
3279 bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3280 if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3281 s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3282 continue;
3283 }
3284 if (s_tensor_blocks[block_ref].p_refs[0])
3285 {
3286 /* If it is already properly assigned, next. */
3287 if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3288 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3289 {
3290 if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3291 s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3292 else {
3293          assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3294 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3295 }
3296 }
3297 /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3298 if (s_tensor_blocks[block_ref].p_refs[1] &&
3299 s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3300 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3301 {
3302         assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3303         assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3304 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3305 }
3306 }
3307 } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3308     /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of an anonymous block,
3309      * which by default only has its life-cycle shared with this sub-graph node. The reason to extend it is that
3310      * anonymous blocks that have a dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3311      * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3312      * its life-time to the end of the output tensor. */
3313 if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3314 s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3315 for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3316      ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3317 }
3318 }
3319 }
3320 const int init_tensor_block_size = tensor_block_size;
3321 int rw_anonymous_buffer_size_cap = 0;
3322 int ro_anonymous_buffer_size_cap = 0;
3323 if (anonymous_block_free_list)
3324 ccv_array_clear(anonymous_block_free_list);
3325 memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3326 for (p = 0; p < node->graph_ref_size; p++)
3327 {
3328   ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3329 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3330 int rw_anonymous_buffer_size = 0;
3331 int ro_anonymous_buffer_size = 0;
3332 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3333 if (s_alloc_prep->buffers[i].p_refs[0])
3334 {
3335 /* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3336 int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3337     /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3338 int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3339     assert(p_ref_0_is_in_or_out != 0);
3340 int unref_p_ref_0 = p_ref_0;
3341 while (tensor_blocks[unref_p_ref_0].ref)
3342 unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3343 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3344     assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3345 if (s_alloc_prep->buffers[i].p_refs[1])
3346 {
3347 int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3348 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3349      assert(p_ref_1_is_in_or_out != 0);
3350 int unref_p_ref_1 = p_ref_1;
3351 while (tensor_blocks[unref_p_ref_1].ref)
3352 unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3353 /* See above comment for the similar p_ref_0 check. */
3354      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3355      assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3356 int p_ref_t;
3357 if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3358 {
3359       CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3360       CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3361 }
3362 p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3363      /* If the dimensions match, we can fold. TODO: should the dimensions match perfectly here? */
3364      if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3365 {
3366 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3367 if (folded)
3368 {
3369 p_ref_0 = p_ref_1;
3370 unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3371 tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3372 for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3373 {
3374 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3375        assert(folded && "the subsequent duplicates can be folded too.");
3376 }
3377 }
3378 }
3379 }
3380     /* Only proceed if it is folded here (thus, the input / output tensor can be connected and reuse is not a problem).
3381      * Or if the p_ref_0 is the output, it is first started from this node (thus, I have full control over
3382      * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3383      * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3384      * within its memory region)). Unless this buffer is used as read-only and we don't have any output
3385      * associated with it, then we are good. */
3386 if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3387 (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3388 (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3389      TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3390     {
3391      if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3392      { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3393 /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3394 * is a long argument why that is the case, the digest is, it is much easier to control your output
3395 * than your input). */
3396 s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3397 s_alloc_prep->buffers[i].p_refs[1] = 0;
3398 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3399      assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3400      tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3401 for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3402 tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3403 tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3404 tensor_blocks[unref_p_ref_0].size;
3405 } else {
3406 s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3407      if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3408 ++ro_anonymous_buffer_size;
3409 else
3410 rw_anonymous_buffer_size += unroll_count + 1;
3411 }
3412 } else {
3413      if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3414 ++ro_anonymous_buffer_size;
3415 else
3416 rw_anonymous_buffer_size += unroll_count + 1;
3417 }
3418 if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3419 {
3420 const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3421 // All read-write buffer (potentially) can be reused between each case..of branch.
3422 rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3423 // Read-only buffer cannot be reused between each case..of branch.
3424 ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3425 /* Anonymous block, allocate additional tensor blocks for this. */
3426 /* This is either because this is an internal tensor (don't have p_ref) */
3427 /* or it is an anonymous block itself within the sub graphs of this while graph. */
3428     tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3429     memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3430     if (dup_tensor_block_ref)
3431      dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3432 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3433 if (!s_alloc_prep->buffers[i].p_refs[0])
3434 {
3435      if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3436 {
3437       assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3438       TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3439       TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3440 tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3441 tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3442 tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3443 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3444 tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3445 ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3446 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3447 if (dup_p_refs && dup_p_refs->rnum > 0)
3448 {
3449 for (j = 0; j < dup_p_refs->rnum; j++)
3450 {
3451         const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3452         assert(dup_p_ref >= 0);
3453         assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3454         assert(tensor_blocks[dup_p_ref].tail);
3455 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3456 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3457 if (tensor_symbol_info[dup_p_ref].p_ref)
3458 {
3459 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3460          assert(p_node_info);
3461 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3462 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3463 {
3464 if (!tensor_blocks[tensor_block_size].dup_p_refs)
3465 tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3466 ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3467 }
3468 }
3469 if (!tensor_blocks[tensor_block_size].tail)
3470 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3471 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3472          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3473 }
3474 } else {
3475 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3476 ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3477 }
3478 for (j = 0; j < source_size; j++)
3479 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3480       /* If this is read-only (based on SSA, if first encountered as a read), and this is a
3481        * sub-graph, mark it to the end of the graph. */
3482 if (p_exec_symbol_info)
3483 for (j = 0; j < destination_size; j++)
3484 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3485 /* If it is read-only, it is self-reflecting. */
3486 for (k = 0; k < unroll_count; k++)
3487 {
3488 for (j = 0; j < destination_size; j++)
3489 if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3490 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3491 /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3492        assert(symbolic_graph->p);
3493 dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3494 }
3495 ++tensor_block_size;
3496 } else {
3497 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3498 const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3499 const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3500 // Find suitable tensor block from the free list.
3501       TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3502       TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3503 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3504 if (new_anonymous_tensor_block)
3505 {
3506 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3507 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3508 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3509 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3510 ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3511 } else {
3512 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3513        tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3514 }
3515 if (dup_p_refs && dup_p_refs->rnum > 0)
3516 {
3517 for (j = 0; j < dup_p_refs->rnum; j++)
3518 {
3519         const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3520         assert(dup_p_ref >= 0);
3521         assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3522 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3523 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3524 if (tensor_symbol_info[dup_p_ref].p_ref)
3525 {
3526 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3527          assert(p_node_info);
3528 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3529 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3530 {
3531 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3532 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3533 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3534 }
3535 }
3536         assert(tensor_blocks[dup_p_ref].tail);
3537 if (!tensor_blocks[tensor_block_idx].tail)
3538 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3539 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3540          _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3541         // We have to add it to the wrap around companion_ref as well.
3542         // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3543         // be occupied and is unlikely to be reused), we cannot really do much about it because the companion_ref's
3544         // definition is too free-form, and if we enforce a stronger guarantee on it (such as it must wrap around), this
3545         // guarantee may break down the line.
3546 if (tensor_blocks[dup_p_ref].companion_ref)
3547 {
3548 const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3549 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3550           _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3551          for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3552           _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3554 }
3555 } else if (new_anonymous_tensor_block) {
3556 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3557 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3558 }
3559 const int prev_tensor_block_idx = tensor_block_idx;
3560 if (new_anonymous_tensor_block)
3561 {
3562 if (!anonymous_block_free_list)
3563 anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3564 ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3565 ++tensor_block_size;
3566 }
3567 for (k = 0; k < unroll_count; k++)
3568 {
3569 const int tensor_block_idx = new_anonymous_tensor_block ?
3570 (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3571 dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3572        TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3573        TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3574 if (new_anonymous_tensor_block)
3575 {
3576 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3577 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3578 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3579 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3580 /* Attach to duplicated exec for this tensor block. */
3581 ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3582 } else {
3583 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3584         tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3585 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3586
3587 }
3588 if (dup_p_refs && dup_p_refs->rnum > 0)
3589 {
3590 /* Not nil, not self-reflecting. */
3591 for (j = 0; j < dup_p_refs->rnum; j++)
3592 {
3593          const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3594          assert(dup_p_ref >= 0);
3595          assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3596 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3597 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3598 if (tensor_symbol_info[dup_p_ref].p_ref)
3599 {
3600 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3601           assert(p_node_info);
3602 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3603 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3604 {
3605 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3606 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3607 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3608 }
3609 }
3610          assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3611          const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3612          assert(tensor_blocks[dup_dup_p_ref].tail);
3613 if (!tensor_blocks[tensor_block_idx].tail)
3614 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3615 for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3616           _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3617          // We have to add it to the wrap around companion_ref as well.
3618 if (tensor_blocks[dup_dup_p_ref].companion_ref)
3619 {
3620 const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3621 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3622            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3623           for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3624            _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3625 }
3626 }
3627 } else if (new_anonymous_tensor_block) {
3628 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3629 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3630 }
3631 if (new_anonymous_tensor_block)
3632 ++tensor_block_size;
3633 }
3634 }
3635 }
3636 }
3637 }
3638 } ccv_nnc_graph_visit_endfor} }
3639 if (anonymous_block_free_list)
3640 ccv_array_free(anonymous_block_free_list);
3641 ccfreefree(tensor_fold);
3642 // It is time to guess the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3643 // the allocation dependencies, i.e., which tensor is reused for which existing tensor.
3644 ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3645 prep->while_count_tensor = 0;
3646 prep->dup_breakpoints = 0;
3647 prep->p = 0;
3648 prep->symbolic_graph = symbolic_graph;
3649 prep->p_idx = symbolic_graph->p_idx;
3650 prep->exec_idx = symbolic_graph->exec_idx;
3651 prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3652 prep->sub_preps = sub_preps;
3653 prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3654 prep->exec_symbol_info = exec_symbol_info;
3655 prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3656 prep->tensor_symbol_info = tensor_symbol_info;
3657 prep->unroll_count = unroll_count;
3658 prep->dup_tensor_block_ref = dup_tensor_block_ref;
3659 prep->tensor_block_size = tensor_block_size;
3660 prep->tensor_blocks = tensor_blocks;
3661 prep->exec_flags = exec_flags;
3662 prep->visit = visit;
3663 prep->alloc_prep = alloc_prep;
3664 if (dup_graph)
3665 ccv_nnc_symbolic_graph_free(dup_graph);
3666 if (dup_exec_ref)
3667 ccfreefree(dup_exec_ref);
3668 return prep;
3669}
3670
3671static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3672{
3673 int i;
3674 _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3675 ccfreefree(prep->exec_flags);
3676 for (i = 0; i < prep->sub_prep_size; i++)
3677 if (prep->sub_preps[i])
3678 _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3679 if (prep->sub_preps)
3680 ccfreefree(prep->sub_preps);
3681 ccfreefree(prep->tensor_symbol_info);
3682 ccfreefree(prep->exec_symbol_info);
3683 if (prep->dup_tensor_block_ref)
3684 ccfreefree(prep->dup_tensor_block_ref);
3685 _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3686 ccv_nnc_graph_visit_free(prep->visit);
3687 ccfreefree(prep);
3688}
3689
3690static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3691{
3692 int i, j;
3693 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
3694 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3695 {
3696 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3697 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3697, __extension__ __PRETTY_FUNCTION__
); }))
;
3698 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3699 for (i = 0; i < node->p_while.input_size; i++)
3700 if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3701 {
3702 ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3703 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3704 for (j = 0; j < d; j++)
3705 prep = prep->p;
3706 prep->while_count_tensor = 1;
3707 }
3708 }
3709 for (i = 0; i < node->graph_ref_size; i++)
3710 {
3711 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3712 if (graph_ref >= 0)
3713 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3714 }
3715 } ccv_nnc_graph_visit_endfor} }
3716}
3717
3718static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3719{
3720 if (symbol >= 0)
3721 return graph_prep->tensor_arena->vt_tensors[symbol];
3722 if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3723 return 0;
3724 assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
: 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3724, __extension__ __PRETTY_FUNCTION__
); }))
;
3725 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3726 int i;
3727 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3728 for (i = 0; i < d; i++)
3729 prep = prep->p;
3730 assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3730, __extension__ __PRETTY_FUNCTION__
); }))
;
3731 return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3732}
3733
3734static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3735{
3736 int i;
3737 int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3738 ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3739 graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3740 graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3741 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3742 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3743 if (graph_execs[i].graph == graph)
3744 graph_execs[i].d = exec_cvt[graph_execs[i].d];
3745 ccfreefree(exec_cvt);
3746}
3747
3748static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3749{
3750 int i, j, k;
3751 ccv_nnc_graph_t* const graph = graph_prep->graph;
3752 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3753 ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
1: Uninitialized value stored to field 'graph'
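// Note: the arena above is allocated with malloc (ccmalloc), so the trailing graph_execs[]
// storage is not zero-initialized; each entry's .graph field only becomes 0 in the
// initialization loop at lines 3761-3769 below.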
3754 graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3755 graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3756 graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3757 graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3758 memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3759 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3760 int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3761 for (i = 0; i < exec_symbol_info_size; i++)
2: Assuming 'i' is >= 'exec_symbol_info_size'
3: Loop condition is false. Execution continues on line 3770
3762 {
3763 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; })
;
3764 max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; })
;
3765 if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3766 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
: _b; })
;
3767 graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3768 graph_execs[i].graph = 0;
3769 }
3770 for (i = 0; i < graph_prep->sub_prep_size; i++)
4: Assuming 'i' is >= field 'sub_prep_size'
5: Loop condition is false. Execution continues on line 3772
3771 max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; })
;
3772 ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
6: '?' condition is true
3773 ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
7: '?' condition is true
3774 ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })
];
8: '?' condition is true
3775 const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3776 const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3777 // Create nodes; this is in topological order.
3778 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
9: Assuming '_i_' is < field 'size'
10: Loop condition is true. Entering loop body
3779 if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
11: The left operand of '==' is a garbage value
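// Analyzer path summary: along this path the loops at lines 3761 and 3770 are both assumed
// to execute zero times (events 2-5), so graph_execs[i].graph is never set to 0, yet the
// visit at line 3778 still yields an idx (events 9-10); the read of graph_execs[idx].graph
// in the CCV_NO_GRAPH_EXEC check above is therefore flagged as a garbage value (event 11).
// Whether exec_symbol_info_size can really be 0 while the visit is non-empty is not
// established by this report; see the standalone sketch after this function for the
// general pattern and one possible defensive fix.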
3780 {
3781 for (i = 0; i < node->input_size; i++)
3782 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3783 for (i = 0; i < node->output_size; i++)
3784 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3785 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3786 {
3787 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3788 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3788, __extension__ __PRETTY_FUNCTION__
); }))
;
3789 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3790 ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3791 graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3792 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3793 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3794 ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3795 for (i = 0; i < node->p_while.input_size; i++)
3796 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3797 for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3798 max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3799 ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3800 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3801 } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3802 for (i = 0; i < node->output_size; i++)
3803 if (max_outputs[i] && max_outputs[i]->alias_ref)
3804 max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3805 graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3806 // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3807 for (i = 0; i < node->case_of.argument.offset; i++)
3808 {
3809 ccv_nnc_tensor_t* const update = max_inputs[i];
3810 if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3811 continue;
3812 int flag = 0;
3813 for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3814 flag = (update == max_inputs[j]);
3815 if (!flag)
3816 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3817 }
3818 const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3819 ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3820 if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3821 {
3822 // Add another graph for data transfer.
3823 ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3824 for (i = 0; i < node->output_size; i++)
3825 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3826 ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
, max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
);
3827 ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3828 ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3829 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3830 int exec_cvt;
3831 ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3832 }
3833 for (i = 0; i < node->graph_ref_size; i++)
3834 {
3835 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3836 if (graph_ref < 0)
3837 continue;
3838 ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3839 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3840 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3841 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3842 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3843 }
3844 } else {
3845 graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3846 }
3847 ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3848 }
3849 } ccv_nnc_graph_visit_endfor} }
3850 // Then connect them.
3851 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
3852 if (node->outgoings)
3853 for (i = 0; i < node->outgoings->rnum; i++)
3854 {
3855 const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
3856 if (graph_execs[outgoing].graph)
3857 ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3858 }
3859 } ccv_nnc_graph_visit_endfor} }
3860 int source_exec_created = 0;
3861 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3862 const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3863 ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3864 // After the graph is materialized, we need to handle the case where some of these tensors need to be initialized to zero before use.
3865 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3866 {
3867 if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
3868 {
3869 int ref = i;
3870 while (tensor_symbol_info[ref].alias_ref)
3871 ref = tensor_symbol_info[ref].alias_ref - 1;
3872 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
3873 ref = tensor_blocks[ref].ref - 1;
3874 // This is not computable. It could be that we marked a const tensor as init zero.
3875 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
)
3876 continue;
3877 // If this tensor is not used by any exec, we don't need to init at all. Skip.
3878 if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3879 continue;
3880 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3881 // Now that we have the original tensor, we can get the actual tensor and construct the set command.
3882 ccv_nnc_graph_exec_t set_exec;
3883 if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3884 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3885 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3886 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3887 for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3888 {
3889 const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)))
;
3890 if (outgoing >= exec_symbol_info_size)
3891 continue;
3892 assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3892, __extension__ __PRETTY_FUNCTION__
); }))
;
3893 assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3893, __extension__ __PRETTY_FUNCTION__
); }))
;
3894 ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3895 }
3896 int flags = 0;
3897 if (alloc_dep[ref])
3898 for (j = 0; j < alloc_dep[ref]->rnum; j++)
3899 {
3900 const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)))
;
3901 // This is from alloc_dep, it should be computable.
3902 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3902, __extension__ __PRETTY_FUNCTION__
); }))
;
3903 if (tensor_blocks[d].tail)
3904 for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3905 {
3906 const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
3907 if (incoming >= exec_symbol_info_size)
3908 continue;
3909 assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3909, __extension__ __PRETTY_FUNCTION__
); }))
;
3910 assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3910, __extension__ __PRETTY_FUNCTION__
); }))
;
3911 ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3912 flags = 1;
3913 }
3914 }
3915 // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3916 if (!flags)
3917 {
3918 if (!source_exec_created)
3919 {
3920 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3921 source_exec_created = 1;
3922 }
3923 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3924 }
3925 }
3926 }
3927 // Now go through the list of tensors to see whether we need to do an explicit broadcast for these tensor multi-views
3928 // (we need that if a multi-view is not associated as an input / output of any exec, which is possible if all execs
3929 // associate with its alias).
3930 assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3930, __extension__ __PRETTY_FUNCTION__
); }))
;
3931 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3932 {
3933 ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3934 // If it is a multi-view tensor, inspect all its heads to see whether we have already associated it with the node.
3935 if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3936 {
3937 const ccv_array_t* const head = tensor_blocks[i].head;
3938 if (head && head->rnum > 0)
3939 for (j = 0; j < head->rnum; j++)
3940 {
3941 const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(j)))
;
3942 if (idx >= exec_symbol_info_size)
3943 continue;
3944 assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3944, __extension__ __PRETTY_FUNCTION__); }))
;
3945 const int d = graph_execs[idx].d;
3946 ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)))
;
3947 int flag = 0;
3948 if (exec_info->tensor_wraps_ref)
3949 {
3950 ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)))
;
3951 for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3952 flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3953 }
3954 // If the flag was never set, it needs to be included in the cast.
3955 if (!flag)
3956 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3957 }
3958 }
3959 }
3960 // Create source / destination phony node. This is to facilitate use of the compiled graph.
3961 // Also, this is needed if you have init-zero execs.
3962 if (source_exec_created || source_size > 1)
3963 {
3964 if (!source_exec_created)
3965 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3966 for (i = 0; i < source_size; i++)
3967 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3968 } else {
3969 assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3969, __extension__ __PRETTY_FUNCTION__
); }))
;
3970 assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3970, __extension__ __PRETTY_FUNCTION__
); }))
;
3971 graph_exec_arena->source = graph_execs[sources[0].d];
3972 }
3973 if (destination_size == 1)
3974 graph_exec_arena->destination = graph_execs[destinations[0].d];
3975 else {
3976 graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3977 for (i = 0; i < destination_size; i++)
3978 ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3979 }
3980 ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3981 ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3982 return graph_exec_arena;
3983}
3984
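The garbage-value warning above comes down to an array whose elements are initialized by one loop but read under a different bound. The following standalone sketch (not part of ccv_nnc_symbolic_graph_compile.c; all names are hypothetical) reproduces that pattern and shows one possible defensive fix, zeroing the storage up front:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

typedef struct { void* graph; int d; } exec_t;

int main(void)
{
	const int exec_size = 0;  /* plays the role of exec_symbol_info_size == 0 (events 2-3) */
	const int visit_size = 1; /* plays the role of a non-empty visit (events 9-10) */
	const size_t count = (size_t)(visit_size > exec_size ? visit_size : exec_size);
	exec_t* const execs = (exec_t*)malloc(sizeof(exec_t) * count);
	/* Defensive fix: without this memset, execs[0].graph below is read uninitialized
	 * whenever exec_size == 0, which is the same shape as the warning at line 3779. */
	memset(execs, 0, sizeof(exec_t) * count);
	int i;
	for (i = 0; i < exec_size; i++)
		execs[i].graph = 0; /* never runs when exec_size == 0 */
	for (i = 0; i < visit_size; i++)
		if (execs[i].graph == 0) /* well-defined only because of the memset above */
			printf("exec %d has no graph yet\n", i);
	free(execs);
	return 0;
}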
3985static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3986{
3987 if (graph_prep->symbolic_graph == pair)
3988 return graph_prep->graph;
3989 int i;
3990 for (i = 0; i < graph_prep->sub_prep_size; i++)
3991 if (graph_prep->sub_preps[i])
3992 {
3993 ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3994 if (graph)
3995 return graph;
3996 }
3997 return 0;
3998}
3999
4000static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4001{
4002 int i;
4003 for (i = 0; i < graph_prep->sub_prep_size; i++)
4004 if (graph_prep->sub_preps[i])
4005 {
4006 if (graph_prep->sub_preps[i]->symbolic_graph->pair)
4007 graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
4008 }
4009}
4010
4011static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4012{
4013 assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4013, __extension__ __PRETTY_FUNCTION__
); }))
;
4014 int i;
4015 for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
4016 {
4017 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
)
)
4018 continue;
4019 if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4020 {
4021 ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4022 .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4023 .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4024 });
4025 if (pair_exec.d >= 0)
4026 ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4027 }
4028 }
4029 for (i = 0; i < graph_prep->sub_prep_size; i++)
4030 if (graph_prep->sub_preps[i])
4031 _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4032}
4033
4034static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4035{
4036 int i;
4037 if (graph_prep->dup_breakpoints)
4038 {
4039 // Stripping the const modifier is only possible because it is a sub-graph.
4040 ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4041 for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4042 ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
+ (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i)))
);
4043 ccv_array_free(graph_prep->dup_breakpoints);
4044 graph_prep->dup_breakpoints = 0;
4045 graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4046 // Afterwards, we have to regenerate the exec_symbol_info and fill in the information (through symbol_infer).
4047 memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0)))
, sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4048 // Since exec_symbol_info changed, create a new visit object.
4049 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4049, __extension__ __PRETTY_FUNCTION__
); }))
;
4050 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4050, __extension__ __PRETTY_FUNCTION__); }))
;
4051 ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)))
;
4052 const int source_size = symbolic_graph->sources->rnum;
4053 ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
;
4054 const int destination_size = symbolic_graph->destinations->rnum;
4055 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4055, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4055, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4055, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
4056 ccv_nnc_graph_visit_free(graph_prep->visit);
4057 graph_prep->visit = visit;
4058 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4058, __extension__ __PRETTY_FUNCTION__
); }))
;
4059 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4060 }
4061 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
4062 for (i = 0; i < node->graph_ref_size; i++)
4063 {
4064 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
4065 if (graph_ref >= 0)
4066 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4067 }
4068 } ccv_nnc_graph_visit_endfor} }
4069}
4070
4071const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4072
4073void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4074{
4075 assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4075, __extension__ __PRETTY_FUNCTION__); }))
;
4076 assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4076, __extension__ __PRETTY_FUNCTION__
); }))
;
4077 assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4077, __extension__ __PRETTY_FUNCTION__
); }))
;
4078 int i;
4079 // Cannot bind the multi-view.
4080 for (i = 0; i < tensor_bind_size; i++)
4081 {
4082 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4082, __extension__ __PRETTY_FUNCTION__
); }))
;
4083 assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4083, __extension__ __PRETTY_FUNCTION__
); }))
;
4084 }
4085 ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4086 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4087 ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4088 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4089 *tensor_arena_ref = tensor_arena;
4090 // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
4091 _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4092 // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
4093 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4094 *graph_ref = graph_prep->graph;
4095 ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4096 _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4097 _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4098 *graph_exec_arena_ref = graph_exec_arena;
4099 _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4100}
4101
4102static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4103{
4104 // Buffers are inherited from above, no need to dealloc.
4105 int i;
4106 for (i = 0; i < tensor_arena->sub_arena_size; i++)
4107 if (tensor_arena->sub_arenas[i])
4108 _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4109 for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4110 {
4111 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
(size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i)))
);
4112 assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4112, __extension__ __PRETTY_FUNCTION__
); }))
;
4113 ccv_nnc_tensor_multiview_free(*mv);
4114 }
4115 ccv_array_free(tensor_arena->tensor_metadata);
4116 ccv_array_free(tensor_arena->m_tensor_idx);
4117 if (tensor_arena->pb_vt_tensors)
4118 ccfreefree(tensor_arena->pb_vt_tensors);
4119 if (tensor_arena->vt_alias_r_refs_p)
4120 ccfreefree(tensor_arena->vt_alias_r_refs_p);
4121 if (tensor_arena->vt_sizes)
4122 ccfreefree(tensor_arena->vt_sizes);
4123 ccfreefree(tensor_arena);
4124}
4125
4126void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4127{
4128 assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
== (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4128, __extension__ __PRETTY_FUNCTION__
); }))
;
4129 assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4129, __extension__ __PRETTY_FUNCTION__
); }))
;
4130 assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4130, __extension__ __PRETTY_FUNCTION__
); }))
;
4131 // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4132 int i;
4133 if (!tensor_arena->pb_vt_tensors)
4134 {
4135 tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4136 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4137 if (tensor_arena->vt_tensors[i])
4138 tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4139 }
4140 if (!tensor_arena->vt_alias_r_refs_p)
4141 {
4142 tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4143 tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4144 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4145 if (tensor_arena->vt_alias_refs[i])
4146 {
4147 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4148 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4148, __extension__ __PRETTY_FUNCTION__
); }))
;
4149 ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many aliases there are.
4150 }
4151 int refp = 0;
4152 for (i = 0; i < tensor_arena->vt_tensor_size; i++) // For each tensor with aliases, allocate its position on vt_alias_r_refs. It points to the end.
4153 if (tensor_arena->vt_alias_r_refs_p[i])
4154 refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4155 else
4156 tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4157 for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4158 tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4159 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4160 if (tensor_arena->vt_alias_refs[i])
4161 {
4162 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4163 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4163, __extension__ __PRETTY_FUNCTION__
); }))
;
4164 const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4165 assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4165, __extension__ __PRETTY_FUNCTION__); }))
;
4166 tensor_arena->vt_alias_r_refs[pos] = i;
4167 }
4168 }
4169 const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
4170 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4171 {
4172 assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4172, __extension__ __PRETTY_FUNCTION__
); }))
; // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4173 assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&
4174 ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
4175 (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
4176 } else
4177 { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
4178 if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
4179 { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
4180 tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4181 if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4182 for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4183 {
4184 const int d = tensor_arena->vt_alias_r_refs[i];
4185 if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4186 break;
4187 ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4188 d_tensor->info.datatype = tensor->info.datatype;
4189 d_tensor->info.reserved = tensor->info.reserved;
4190 if (CCV_IS_TENSOR_VIEW(d_tensor))
4191 ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4192 else {
4193 d_tensor->data.u8 = tensor->data.u8;
4194 d_tensor->dataof = tensor->dataof;
4195 }
4196 }
4197}
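
The loop above completes the symbol-binding path: after the caller-supplied tensor's data pointer is written into the arena slot for symbol_d, every alias view of that symbol found through the vt_alias_r_refs reverse index is re-pointed as well. A minimal usage sketch, assuming the public entry point is ccv_nnc_tensor_bind_symbol as declared in ccv_nnc.h; arena, x_symbol and x are placeholder names:

    // Sketch (assumption): bind a caller-owned buffer to a compiled symbol; the loop
    // over vt_alias_r_refs above re-points every alias view of x_symbol as well.
    ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 128), 0);
    ccv_nnc_tensor_bind_symbol(arena, x_symbol, x);
    /* ... run the graph ... */
    ccv_nnc_tensor_arena_clear_bindings(arena); // restore the arena-owned data pointers
    ccv_nnc_tensor_free(x);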
4198
4199void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4200{
4201 if (!tensor_arena->pb_vt_tensors)
4202 return;
4203 int i;
4204 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4205 if (tensor_arena->vt_tensors[i])
4206 tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4207}
4208
4209uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4210{
4211 uint64_t total_size = 0;
4212 int i;
4213 for (i = 0; i < tensor_arena->buffer_size; i++)
4214 total_size += tensor_arena->buffers[i].size;
4215 return total_size;
4216}
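
ccv_nnc_tensor_arena_size reports the arena's total allocation footprint by summing the backing buffers, which is generally smaller than the sum of all tensor sizes because aliases and folded blocks share buffers. A small sketch of logging that footprint after compilation; arena is a placeholder name:

    // Sketch: report how much memory the compiled graph actually reserved.
    const uint64_t footprint = ccv_nnc_tensor_arena_size(arena); // bytes across all buffers
    printf("tensor arena footprint: %.2f MiB\n", footprint / (1024.0 * 1024.0));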
4217
4218static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4219{
4220 int i;
4221 if (mv->it)
4222 mv->it->info = params;
4223 for (i = 0; i < mv->repeat + mv->kind; i++)
4224 {
4225 ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
4226 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4227 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4228 else
4229 tensor->info = params;
4230 }
4231}
4232
4233int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4234{
4235 int i;
4236 assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
4237 if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
4238 {
4239 tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4240 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4241 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4242 {
4243 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4244 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4245 {
4246 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4247 while (CCV_IS_TENSOR_MULTIVIEW(mv))
4248 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
4249 tensor = (ccv_nnc_tensor_t*)mv;
4250 }
4251 tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4252 }
4253 }
4254 int flag = 0;
4255 for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4256 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4257 {
4258 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4259 ccv_nnc_tensor_param_t params = symbol_info->info;
4260 params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4261 params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4262 flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4263 }
4264 if (flag)
4265 return -1;
4266 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4267 if (tensor_arena->vt_tensors[i])
4268 {
4269 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4270 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4271 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4272 {
4273 assert(!tensor_arena->vt_alias_refs[i]);
4274 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4275 } else if (!tensor_arena->vt_alias_refs[i]) {
4276 ccv_nnc_tensor_param_t params = symbol_info->info;
4277 params.datatype = tensor->info.datatype;
4278 params.reserved = tensor->info.reserved;
4279 tensor->info = params;
4280 } else {
4281 off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4282 ccv_nnc_tensor_param_t params = symbol_info->info;
4283 params.datatype = tensor->info.datatype;
4284 params.reserved = tensor->info.reserved;
4285 tensor->info = params;
4286 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4287 ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4288 if (CCV_IS_TENSOR_VIEW(tensor))
4289 {
4290 ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4291 memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4292 }
4293 }
4294 }
4295 // We should handle sub_tensor_arena here, but we don't do that at the moment.
4296 assert(!graph->sub_graphs);
4297 return 0;
4298}
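
ccv_nnc_tensor_arena_reinit lets an existing arena be reused after tensor symbol shapes change: on first use it records each tensor's original byte size, returns -1 if any new shape would need more bytes than were originally allocated, and otherwise rewrites tensor parameters and alias offsets in place. A hedged sketch of the intended call pattern; symbolic_graph, arena and x_symbol are placeholders, and ccv_nnc_tensor_symbol_set is assumed to be the shape-update entry point:

    // Sketch: shrink-or-equal reshape without recompiling the graph.
    ccv_nnc_tensor_symbol_set(symbolic_graph, x_symbol, CPU_TENSOR_NHWC(32F, 4, 128)); // smaller shape
    if (ccv_nnc_tensor_arena_reinit(arena, symbolic_graph) != 0) {
        // The new shapes would overflow the original buffers; recompile instead.
    }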
4299
4300void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4301{
4302 assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4303 int i;
4304 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4305 {
4306 const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4307 if (graph_exec.d < 0)
4308 continue;
4309 const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4310 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4311 ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4312 if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, carry over the backend and algorithm from the existing one, which has hypothetically been autotuned.
4313 {
4314 new_cmd.backend = existing_cmd.backend;
4315 new_cmd.algorithm = existing_cmd.algorithm;
4316 }
4317 ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4318 }
4319}
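
ccv_nnc_graph_exec_reinit refreshes each concrete exec node from its symbol's current command, and when the command type is unchanged it preserves the backend and algorithm already chosen (for example by autotuning). It is typically paired with ccv_nnc_tensor_arena_reinit after shape updates; a hedged sketch with placeholder names:

    // Sketch: refresh commands after updating the symbolic graph, keeping autotuned choices.
    if (ccv_nnc_tensor_arena_reinit(arena, symbolic_graph) == 0)
        ccv_nnc_graph_exec_reinit(graph_exec_arena, graph, symbolic_graph);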
4320
4321void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4322{
4323 int i;
4324 for (i = 0; i < tensor_arena->buffer_size; i++)
4325 {
4326 if (!tensor_arena->buffers[i].ptr)
4327 continue;
4328 const int buffer_type = tensor_arena->buffers[i].type;
4329 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4330#ifdef HAVE_CUDA
4331 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4332 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4333 {
4334 if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4335 tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4336 else
4337 cufree(device_id, tensor_arena->buffers[i].ptr);
4338 } else {
4339 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4340 if (tensor_arena->buffers[i].pin_mem)
4341 cuhostfree(tensor_arena->buffers[i].ptr);
4342 else
4343 ccfree(tensor_arena->buffers[i].ptr);
4344 }
4345#elif defined(HAVE_MPS)
4346 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4347 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4348 {
4349 // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4350 // tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4351 // else
4352 mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4353 } else {
4354 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4355 ccfree(tensor_arena->buffers[i].ptr);
4356 }
4357#else
4358 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4359 ccfree(tensor_arena->buffers[i].ptr);
4360#endif
4361 tensor_arena->buffers[i].ptr = 0;
4362 }
4363 // For now, the life-cycle of the disposers lives with the buffer. It may end before the tensor arena deallocates.
4364 if (tensor_arena->disposers)
4365 {
4366 for (i = 0; i < tensor_arena->disposers->rnum; i++)
4367 {
4368 ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i);
4369 disposer->dispose(disposer->ptr, disposer->userdata);
4370 }
4371 ccv_array_free(tensor_arena->disposers);
4372 tensor_arena->disposers = 0;
4373 }
4374}
4375
4376void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4377{
4378 ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4379 _ccv_nnc_tensor_arena_free(tensor_arena);
4380}
4381
4382void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4383{
4384 int i;
4385 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4386 if (graph_exec_arena->sub_arenas[i])
4387 ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4388 ccfree(graph_exec_arena);
4389}
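
Teardown is split so that device memory can be released early: ccv_nnc_tensor_arena_buffer_free drops the backing buffers and runs any registered disposers while keeping the arena metadata alive, whereas ccv_nnc_tensor_arena_free releases everything. A hedged sketch of the usual cleanup order for the objects produced by compilation, assuming ccv_nnc_graph_free is the matching destructor for the concrete graph; names are placeholders:

    // Sketch: typical cleanup of a compiled graph and its arenas.
    ccv_nnc_graph_free(graph);                       // concrete graph first
    ccv_nnc_tensor_arena_free(arena);                // frees buffers, then arena metadata
    ccv_nnc_graph_exec_arena_free(graph_exec_arena); // recursively frees sub-arenas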