ccv_nnc_symbolic_graph

Bug Summary

File:	nnc/ccv_nnc_symbolic_graph_compile.c
Warning:	line 4184, column 5 Access to field 'info' results in a dereference of a null pointer
Annotated Source Code

Press '?' to see keyboard shortcuts
Show analyzer invocation
clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-05-20-214901-2842894-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA1
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12 
13// MARK - Level-3 API
14 
15typedef struct {
16	int flags;
17	int type;
18	int pin_mem; // This memory need to be pinned.
19	int ref; // Reference to another tensor block. Start with 1.
20	int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21	int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22	int companion_ref; // Reference to another block that they two share the same memory region. Start with 1. the current crude implementation requires the two mutually be companion. Because there are two, we took the one that companion_ref <= i as the primary and companion_ref > i is the secondary. For allocation algorithm, we use the primary throughout.
23	int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24	ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25	uint64_t size; // The size of the tensor expected.
26	int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27	ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28	ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29	ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31 
// True when block `idx` is the primary of its companion pair, i.e. its index is smaller than
// the 0-based index of its companion. When companion_ref is 0 (no companion) the subtraction
// yields -1, which the uint32_t cast turns into the largest unsigned value, so a block without
// a companion always counts as primary.
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33 
// Bit values stored in the `flags` field of a tensor block.
// The low two bits (mask 0x3) hold the block kind: 0 (ordinary), UNASSIGNED or ALIAS.
// Bits 0xc hold the access mode: READ_ONLY, WRITE_ONLY or both (READ_WRITE).
enum {
	UNASSIGNED = 0x1,
	ALIAS = 0x2,
	READ_ONLY = 0x4,
	WRITE_ONLY = 0x8,
	READ_WRITE = 0xc,
	ANONYMOUS = 0x10, // Mark this block as anonymous (thus, it does not reference any specific tensor).
	UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
	UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
};

// Accessors for the `flags` field. `t` is any lvalue expression with an int `flags` member.
// Every macro argument is parenthesized so that expressions such as `*ptr` or
// `cond ? READ_ONLY : WRITE_ONLY` can be passed safely.
#define TENSOR_EXPECT_ORDINARY(t) (((t).flags & 0x3) == 0)
#define TENSOR_EXPECT_SET_ORDINARY(t) ((t).flags = ((t).flags & ~0x3))
#define TENSOR_EXPECT_UNASSIGNED(t) (((t).flags & 0x3) == UNASSIGNED)
#define TENSOR_EXPECT_SET_UNASSIGNED(t) ((t).flags = (((t).flags & ~0x3) | UNASSIGNED))
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) ((t).flags = ((t).flags & ~0x1))
#define TENSOR_EXPECT_ALIAS(t) (((t).flags & 0x3) == ALIAS)
// Computable means neither an alias nor unassigned.
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
#define TENSOR_READ_WRITE(t) ((t).flags & 0xc)
#define TENSOR_SET_READ_WRITE(t, rw) ((t).flags = (((t).flags & ~0xc) | (rw)))
#define TENSOR_SET_ANONYMOUS(t) ((t).flags = (((t).flags & ~0x10) | ANONYMOUS))
#define TENSOR_IS_ANONYMOUS(t) ((t).flags & ANONYMOUS)
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_INPUT))
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) ((t).flags & UNFOLDABLE_AS_INPUT)
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_OUTPUT))
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) ((t).flags & UNFOLDABLE_AS_OUTPUT)

// True when the tensor symbol flags request initialization to zeros or to ones.
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62 
// Holds additional information about the exec nodes.
typedef struct {
	int flags; // Bit flags; CCV_NNC_GRAPH_EXEC_ATTR_* values.
} ccv_nnc_graph_exec_flag_t;
67 
// Attribute bits for exec nodes.
enum {
	CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert an additional IO transfer for the case..of statement.
};
71 
// One allocation candidate: a tensor block together with the keys it is sorted by.
typedef struct {
	int index; // Index of the tensor block this candidate stands for.
	int oc; // Overlap count of the block (plus that of its companion, when it has one).
	int type; // Memory type copied from the tensor block.
	uint64_t size; // Size of the block (the larger of the block and its companion, when it has one).
} ccv_nnc_tensor_opt_t;
78 
79// We first sort the same type together (because they won't be reused at all.
80// And then we sort by size, after that, sort by oc.
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)void _ccv_nnc_tensor_opt_sort_by_size_and_oc(ccv_nnc_tensor_opt_t
 *array, size_t total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_opt_t
 t; int sp = 0; struct { ccv_nnc_tensor_opt_t *lb; ccv_nnc_tensor_opt_t
 *ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_opt_t
* left = stack[sp].lb; ccv_nnc_tensor_opt_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_opt_t
* ptr; ccv_nnc_tensor_opt_t* ptr2; if( n <= isort_thresh )
 { insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
 { for( ptr2 = ptr; ptr2 > left && more_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
 ((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_opt_t
* left0; ccv_nnc_tensor_opt_t* left1; ccv_nnc_tensor_opt_t* right0
; ccv_nnc_tensor_opt_t* right1; ccv_nnc_tensor_opt_t* pivot; ccv_nnc_tensor_opt_t
* a; ccv_nnc_tensor_opt_t* b; ccv_nnc_tensor_opt_t* c; int swap_cnt
 = 0; left0 = left; right0 = right; pivot = left + (n/2); if(
 n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
 + 2*d; left = more_than(*a, *b, aux) ? (more_than(*b, *c, aux
) ? b : (more_than(*a, *c, aux) ? c : a)) : (more_than(*c, *b
, aux) ? b : (more_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = more_than(*a, *b, aux) ? (
more_than(*b, *c, aux) ? b : (more_than(*a, *c, aux) ? c : a)
) : (more_than(*c, *b, aux) ? b : (more_than(*a, *c, aux) ? a
 : c)); a = right - 2*d, b = right - d, c = right; right = more_than
(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than(*a, *
c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
 = more_than(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than
(*a, *c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !more_than(*pivot, *left
, aux) ) { if( !more_than(*left, *pivot, aux) ) { if( left >
 left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
 right && !more_than(*right, *pivot, aux) ) { if( !more_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
 = 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
 swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
 left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
 _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
 ((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
 ((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
 _b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
 1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
 stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
 } else left = left0, right = left0 + n - 1; } else if( m >
 1 ) left = right0 - m + 1, right = right0; else break; } } }
 }
83#undef more_than
// A tensor block reference paired with a hop count; sorted by _ccv_nnc_sort_by_hops.
typedef struct {
	int idx; // Block reference. The allocation planner stores the block index plus one here.
	int hop; // Hop count associated with the block.
} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)void _ccv_nnc_sort_by_hops(ccv_nnc_tensor_hop_t *array, size_t
 total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_hop_t
 t; int sp = 0; struct { ccv_nnc_tensor_hop_t *lb; ccv_nnc_tensor_hop_t
 *ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_hop_t
* left = stack[sp].lb; ccv_nnc_tensor_hop_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_hop_t
* ptr; ccv_nnc_tensor_hop_t* ptr2; if( n <= isort_thresh )
 { insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
 { for( ptr2 = ptr; ptr2 > left && less_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
 ((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_hop_t
* left0; ccv_nnc_tensor_hop_t* left1; ccv_nnc_tensor_hop_t* right0
; ccv_nnc_tensor_hop_t* right1; ccv_nnc_tensor_hop_t* pivot; ccv_nnc_tensor_hop_t
* a; ccv_nnc_tensor_hop_t* b; ccv_nnc_tensor_hop_t* c; int swap_cnt
 = 0; left0 = left; right0 = right; pivot = left + (n/2); if(
 n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
 + 2*d; left = less_than(*a, *b, aux) ? (less_than(*b, *c, aux
) ? b : (less_than(*a, *c, aux) ? c : a)) : (less_than(*c, *b
, aux) ? b : (less_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = less_than(*a, *b, aux) ? (
less_than(*b, *c, aux) ? b : (less_than(*a, *c, aux) ? c : a)
) : (less_than(*c, *b, aux) ? b : (less_than(*a, *c, aux) ? a
 : c)); a = right - 2*d, b = right - d, c = right; right = less_than
(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than(*a, *
c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
 = less_than(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than
(*a, *c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !less_than(*pivot, *left
, aux) ) { if( !less_than(*left, *pivot, aux) ) { if( left >
 left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
 right && !less_than(*right, *pivot, aux) ) { if( !less_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
 = 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
 swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
 left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
 _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
 ((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
 ((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
 _b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
 1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
 stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
 } else left = left0, right = left0 + n - 1; } else if( m >
 1 ) left = right0 - m + 1, right = right0; else break; } } }
 }
90#undef less_than
91 
92// If b has items overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95	assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
 ("a", "ccv_nnc_symbolic_graph_compile.c", 95, __extension__ __PRETTY_FUNCTION__
); }));
96	assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
 ("b", "ccv_nnc_symbolic_graph_compile.c", 96, __extension__ __PRETTY_FUNCTION__
); }));
97	int x, y;
98	for (x = 0; x < b->rnum; x++)
99	{
100		const int p = *(int*)ccv_array_get(b, x)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(x)));
101		int flag = 0;
102		// In extreme cases where a is a superset of b, then a is still after b, we are good.
103		for (y = 0; !flag && y < a->rnum; y++)
104		{
105			const int q = *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y)));
106			flag = (p == q);
107		}
108		if (!flag)
109			for (y = 0; y < a->rnum; y++)
110			{
111				ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y))), p);
112				if (!cell.i32 || cell.i32[0] == 0)
113					return 0;
114			}
115	}
116	// If b->rnum == 0, a is after b for sure.
117	// Otherwise, if a->rnum == 0, we don't check any, buf if b->rnum > 0, then we cannot say a is after b.
118	// if both a->rnum > 0 and b->rnum > 0, above logic should checked all.
119	return (a->rnum > 0 || b->rnum == 0);
120}
121 
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124	assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
 ("a", "ccv_nnc_symbolic_graph_compile.c", 124, __extension__
 __PRETTY_FUNCTION__); }));
125	assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
 ("b", "ccv_nnc_symbolic_graph_compile.c", 125, __extension__
 __PRETTY_FUNCTION__); }));
126	if (!a->rnum || !b->rnum)
127		return 0;
128	int x, y, max_hop = 0;
129	for (x = 0; x < a->rnum; x++)
130	{
131		ccv_sparse_matrix_vector_t* const vector = ccv_get_sparse_matrix_vector(exec_dep, *(int*)ccv_array_get(a, x)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(x))));
132		if (!vector)
133			return 0;
134		for (y = 0; y < b->rnum; y++)
135		{
136			const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, *(int*)ccv_array_get(b, y)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(y))));
137			if (!cell.i32 || cell.i32[0] == 0)
138				return 0;
139			if (cell.i32[0] > max_hop)
140				max_hop = cell.i32[0];
141		}
142	}
143	// We've entered this nested-for loop, therefore, it must be verifiably, deterministically after b now.
144	// The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
145	return max_hop;
146}
147 
148// If every a's head is deterministically after b's tail
149static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
150{
151	return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
152}
153 
154typedef struct {
155	ccv_array_t** alloc_dep;
156	int vt_block_size;
157	int buffer_size;
158	int block_size;
159	int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
160	struct {
161		int type; // The type from tensor blocks.
162		int pin_mem; // Whether this is pinned memory.
163		int flags; // The flags (currently for READ_ONLY or not).
164		uint64_t size; // The size of the buffer allocated.
165		int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
166		ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
167	}* buffers;
168	struct {
169		int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
170		int block_ref; // A reference to which block in the given tensor_block to use.
171		uint64_t offset; // The offset of this block.
172	}* blocks;
173} ccv_nnc_tensor_alloc_prep_t;
174 
175typedef struct ccv_nnc_symbolic_graph_prep_s {
176	int flags;
177	int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
178	int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
179	int exec_idx;
180	int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
181	int tensor_symbol_info_size;
182	int exec_symbol_info_size;
183	int tensor_block_size;
184	int sub_prep_size;
185	ccv_nnc_tensor_block_t* tensor_blocks;
186	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
187	ccv_nnc_graph_exec_flag_t* exec_flags;
188	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
189	int* dup_tensor_block_ref;
190	ccv_nnc_graph_visit_t* visit;
191	ccv_nnc_tensor_alloc_prep_t* alloc_prep;
192	struct ccv_nnc_symbolic_graph_prep_s* p;
193	struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
194	// Structures that don't require to be freed after deallocation.
195	const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
196	ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
197	ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
198	ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
199} ccv_nnc_symbolic_graph_prep_t;
200 
201typedef struct {
202	int oc;
203	ccv_array_t* itf;
204} ccv_nnc_tensor_block_adjacent_t;
205 
206static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
207{
208	// Compute how many dis-continuous buffers are needed.
209	// We prefer to have several dis-continuous buffers instead of one big buffer because
210	// in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
211	// to fully utilize memory.
212	int i, j, k;
213	ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloccalloc(tensor_block_size, sizeof(ccv_array_t*));
214	int allocable_tensor_size = 0, available_tensor_size = 0;
215	for (i = 0; i < tensor_block_size; i++)
216		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
217		{
218			// Tensors that we need the header info.
219			++available_tensor_size;
220			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
221				// Tensors that we actually need to allocate (exclude the alias).
222				++allocable_tensor_size;
223		}
224	ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
225	ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
226	ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
227	// Overlap count.
228	for (i = 0; i < tensor_block_size; i++)
229		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
230			for (j = i + 1; j < tensor_block_size; j++)
231				if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j])(!((tensor_blocks[j].flags & 0x3) == ALIAS) && !(
(tensor_blocks[j].flags & 0x3) == UNASSIGNED)))
232				{
233					// We only reuse buffers within the same memory type. The tensor_dt / tensor_df
234					// matrices are only queried later for same-type candidates in this function,
235					// thus cross-type hop relations are not needed for allocation planning here.
236					if (tensor_blocks[i].type != tensor_blocks[j].type)
237						continue;
238					// Check to see if they interfere (default to yes).
239					// If any of the i's head is deterministically later than j's tail
240					// or any of the i's tail is deterministically earlier than j's head, they don't interfere.
241					const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
242					int j_hop_i = 0;
243					if (i_hop_j > 0)
244					{
245						ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
246						ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
247					} else {
248						// It cannot be that both directions are positive. If i can hop to j, we don't
249						// need the reverse hop value for any subsequent allocation decision.
250						j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
251						if (j_hop_i > 0)
252						{
253							ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
254							ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
255						}
256					}
257					if (!i_hop_j && !j_hop_i)
258					{
259						if (!adj[i].itf)
260							adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
261						ccv_array_push(adj[i].itf, &j);
262						++adj[i].oc;
263						if (!adj[j].itf)
264							adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
265						ccv_array_push(adj[j].itf, &i);
266						++adj[j].oc;
267					}
268				}
269	const int exec_dep_rows = exec_dep->rows;
270	ccv_matrix_free(exec_dep);
271	ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
272	int* const assigned = (int*)cccalloccalloc(tensor_block_size, sizeof(int));
273	uint64_t* const allocated_offset = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
274	uint64_t* const allocated_size = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
275	uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
276	int num_assigned = 0; 
277	// I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
278	// Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
279	// The first channel denotes the bytes available for allocation,
280	// the second channel denotes the offset available for the allocation,
281	ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
282	ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
283	for (j = 0; j < allocable_tensor_size;)
284	{
285		// Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
286		uint64_t max_size = 0;
287		ccv_array_clear(opt);
288		int current_type = 0; // Deal with one type at a time.
289		for (i = 0; i < tensor_block_size; i++)
290			if (tensor_blocks[i].size >= max_size &&
291				TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && !assigned[i] &&
292				IS_PRIMARY_COMPANION(i, tensor_blocks[i])((i) < (uint32_t)((tensor_blocks[i]).companion_ref - 1)) &&
293				(!current_type || tensor_blocks[i].type == current_type))
294			{
295				ccv_nnc_tensor_opt_t a = {
296					.size = tensor_blocks[i].size,
297					.index = i,
298					.oc = adj[i].oc,
299					.type = tensor_blocks[i].type,
300				};
301				assert(a.type)((void) sizeof ((a.type) ? 1 : 0), __extension__ ({ if (a.type
) ; else __assert_fail ("a.type", "ccv_nnc_symbolic_graph_compile.c"
, 301, __extension__ __PRETTY_FUNCTION__); }));
302				current_type = a.type; // Now we now the primary type we should deal with.
303				if (tensor_blocks[i].companion_ref)
304				{
305					const int companion_ref = tensor_blocks[i].companion_ref - 1;
306					a.size = ccv_max(a.size, tensor_blocks[companion_ref].size)({ typeof (a.size) _a = (a.size); typeof (tensor_blocks[companion_ref
].size) _b = (tensor_blocks[companion_ref].size); (_a > _b
) ? _a : _b; });
307					a.oc += adj[companion_ref].oc;
308				}
309				// In case we have a tie, take them all in the array.
310				if (a.size > max_size)
311					ccv_array_clear(opt), max_size = a.size;
312				ccv_array_push(opt, &a);
313			}
314		assert(opt->rnum > 0)((void) sizeof ((opt->rnum > 0) ? 1 : 0), __extension__
 ({ if (opt->rnum > 0) ; else __assert_fail ("opt->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 314, __extension__ __PRETTY_FUNCTION__
); }));
315		// Order opt array by the oc because type and size should be equal at this point.
316		_ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
317		// Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
318		int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
319		uint64_t min_val[2] = {
320			0, 0
321		};
322		if (j > 0)
323		{
324			for (i = 0; i < opt->rnum; i++)
325			{
326				ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i)((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
 (size_t)(i)));
327				if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
328					continue;
329				// Now, determine the order between a and c. After this, we can always check whether y
330				// can hop to the earliest one and if the latest one can hop to x.
331				// The earliest one will be called p and the latest one will be called q.
332				int p = a.index;
333				int q = a.index;
334				if (tensor_blocks[a.index].companion_ref)
335				{
336					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
337					if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
338						continue;
339					const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
340					if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
341						p = companion_ref;
342					else {
343						const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
344						if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
345							q = companion_ref;
346						else { // Otherwise, b is in between p and q.
347							const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
348							const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
349							assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0)((void) sizeof ((p_hop_b.i32 && p_hop_b.i32[0] > 0
 && b_hop_q.i32 && b_hop_q.i32[0] > 0) ? 1
 : 0), __extension__ ({ if (p_hop_b.i32 && p_hop_b.i32
[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] >
 0) ; else __assert_fail ("p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 349, __extension__ __PRETTY_FUNCTION__
); }));
350						}
351					}
352				}
353				assert(tensor_blocks[q].type == tensor_blocks[p].type)((void) sizeof ((tensor_blocks[q].type == tensor_blocks[p].type
) ? 1 : 0), __extension__ ({ if (tensor_blocks[q].type == tensor_blocks
[p].type) ; else __assert_fail ("tensor_blocks[q].type == tensor_blocks[p].type"
, "ccv_nnc_symbolic_graph_compile.c", 353, __extension__ __PRETTY_FUNCTION__
); }));
354				const int type = tensor_blocks[p].type;
355				// y is always earlier than x, but this is hard to assert now.
356				// If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
357				// Thus, the hop between y and x (through a) should be smallest ones.
358				// We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
359				// out of q. For these nodes, we try to verify whether they form a connection (by checking against
360				// alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
361				int y_size = 0;
362				ccv_nnc_tensor_hop_t* const y_buf = buf;
363#define for_block(y, val) do { \
364					if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
365						y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
366							.idx = y + 1, .hop = ((int*)val)[0] \
367						}; \
368				} while(0)
369				ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
370				if (y_vector)
371					CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
372#undef for_block
373				assert(y_size <= tensor_block_size)((void) sizeof ((y_size <= tensor_block_size) ? 1 : 0), __extension__
 ({ if (y_size <= tensor_block_size) ; else __assert_fail (
"y_size <= tensor_block_size", "ccv_nnc_symbolic_graph_compile.c"
, 373, __extension__ __PRETTY_FUNCTION__); }));
374				int x_size = 0;
375				ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
376#define for_block(x, val) do { \
377					if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
378						x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
379							.idx = x + 1, .hop = ((int*)val)[0] \
380						}; \
381				} while(0)
382				ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
383				if (x_vector)
384					CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
385#undef for_block
386				assert(y_size + x_size <= tensor_block_size)((void) sizeof ((y_size + x_size <= tensor_block_size) ? 1
 : 0), __extension__ ({ if (y_size + x_size <= tensor_block_size
) ; else __assert_fail ("y_size + x_size <= tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 386, __extension__ __PRETTY_FUNCTION__
); }));
387				int x, y;
388				if (y_size > 1)
389					_ccv_nnc_sort_by_hops(y_buf, y_size, 0);
390				for (y = 0; y < y_size; y++)
391				{
392					const int hop = exec_dep_rows + y_buf[y].hop;
393					if (hop >= min_hop)
394						break;
395					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
396					if (val.u64 && val.u64[0] >= a.size)
397					{
398						min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
399							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
400						break;
401					}
402				}
403				if (x_size > 1)
404					_ccv_nnc_sort_by_hops(x_buf, x_size, 0);
405				for (x = 0; x < x_size; x++)
406				{
407					const int hop = exec_dep_rows + x_buf[x].hop;
408					if (hop >= min_hop)
409						break;
410					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
411					if (val.u64 && val.u64[0] >= a.size)
412					{
413						min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
414							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
415						break;
416					}
417				}
418				if (x_size > 0)
419				{
420					const int x_min_hop = x_buf[0].hop;
421					for (y = 0; y < y_size; y++)
422					{
423						const int y_hop_p_v = y_buf[y].hop;
424						if (y_hop_p_v + x_min_hop >= min_hop)
425							break;
426						ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
427						if (y_vector)
428						{
429							for (x = 0; x < x_size; x++)
430							{
431								const int q_hop_x_v = x_buf[x].hop;
432								const int hop = y_hop_p_v + q_hop_x_v;
433								if (hop >= min_hop)
434									break;
435								const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
436								if (val.u64 && val.u64[0] >= a.size)
437								{
438									min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
439										min_val[0] = val.u64[0], min_val[1] = val.u64[1];
440									break;
441								}
442							}
443						}
444					}
445				}
446				// If I found a place, stop, and exit.
447				if (min_y > 0 || min_x < tensor_block_size + 1)
448				{
449					min_i = i;
450					break;
451				}
452				// There is no space to insert this block, mark it as such.
453				tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
454				if (tensor_blocks[a.index].companion_ref)
455				{
456					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
457					tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
458				}
459			}
460		}
461		// If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
462		// and default to largest size available.
463		ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i))((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
 (size_t)(({ typeof (0) _a = (0); typeof (min_i) _b = (min_i)
; (_a > _b) ? _a : _b; }))));
464		if (min_i == -1)
465		{
466			allocated_size[num_assigned] = a.size;
467			++num_assigned;
468		}
469		int assign_group = num_assigned;
470		if (min_y > 0)
471		{
472			assign_group = assigned[min_y - 1];
473			// The y and x should belong to the same assigned group.
474			assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group)((void) sizeof ((min_x == tensor_block_size + 1 || assigned[min_x
 - 1] == assign_group) ? 1 : 0), __extension__ ({ if (min_x ==
 tensor_block_size + 1 || assigned[min_x - 1] == assign_group
) ; else __assert_fail ("min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group"
, "ccv_nnc_symbolic_graph_compile.c", 474, __extension__ __PRETTY_FUNCTION__
); }));
475		} else if (min_x < tensor_block_size + 1)
476			assign_group = assigned[min_x - 1];
477		// If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
478		if (min_y != 0 || min_x != tensor_block_size + 1)
479		{
480			uint64_t val[2] = {
481				min_val[0], min_val[1]
482			};
483			assert(val[0] >= a.size)((void) sizeof ((val[0] >= a.size) ? 1 : 0), __extension__
 ({ if (val[0] >= a.size) ; else __assert_fail ("val[0] >= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 483, __extension__ __PRETTY_FUNCTION__
); }));
484			val[0] -= a.size;
485			val[1] = val[1] + a.size; // Move the offset to the next one.
486			ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
487		}
488		int strings[3];
489		strings[0] = a.index + 1;
490		int string_size = 1;
491		// Assign out the designated companion if it exists.
492		if (tensor_blocks[a.index].companion_ref)
493		{
494			const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495			assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type)((void) sizeof ((tensor_blocks[a.index].type == tensor_blocks
[companion_ref].type) ? 1 : 0), __extension__ ({ if (tensor_blocks
[a.index].type == tensor_blocks[companion_ref].type) ; else __assert_fail
 ("tensor_blocks[a.index].type == tensor_blocks[companion_ref].type"
, "ccv_nnc_symbolic_graph_compile.c", 495, __extension__ __PRETTY_FUNCTION__
); }));
496			const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497			if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498			{
499				for (i = 0; i < string_size; i++)
500					strings[i + 1] = strings[i];
501				strings[0] = companion_ref + 1;
502			} else {
503				const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504				if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505					strings[string_size] = companion_ref + 1;
506				else {
507					// Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
508					assert(string_size == 2)((void) sizeof ((string_size == 2) ? 1 : 0), __extension__ ({
 if (string_size == 2) ; else __assert_fail ("string_size == 2"
, "ccv_nnc_symbolic_graph_compile.c", 508, __extension__ __PRETTY_FUNCTION__
); }));
509					strings[2] = strings[1];
510					strings[1] = companion_ref + 1;
511				}
512			}
513			++string_size;
514		}
515		// Assign out and update oc.
516		for (i = 0; i < string_size; i++)
517		{
518			const int index = strings[i] - 1;
519			// Assign out the selected one.
520			assigned[index] = assign_group;
521			// The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522			allocated_offset[index] = min_val[1];
523			if (adj[index].itf)
524				for (k = 0; k < adj[index].itf->rnum; k++)
525				{
526					const int d = *(int*)ccv_array_get(adj[index].itf, k)((void*)(((char*)((adj[index].itf)->data)) + (size_t)(adj[
index].itf)->rsize * (size_t)(k)));
527					if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])(!((tensor_blocks[d].flags & 0x3) == ALIAS) && !(
(tensor_blocks[d].flags & 0x3) == UNASSIGNED)))
528						--adj[d].oc;
529				}
530		}
531		uint64_t val[2] = {
532			a.size, min_val[1]
533		};
534		uint64_t consumed_size = 0;
535		// Go over from min_y to string_size (excluding min_x).
536		for (i = 0; i < string_size; i++)
537		{
538			const uint64_t size = tensor_blocks[strings[i] - 1].size;
539			assert(size <= a.size)((void) sizeof ((size <= a.size) ? 1 : 0), __extension__ (
{ if (size <= a.size) ; else __assert_fail ("size <= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 539, __extension__ __PRETTY_FUNCTION__
); }));
540			// Update consumed size if it is bigger than "size".
541			if (size > consumed_size)
542			{
543				val[0] = size - consumed_size;
544				ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545				consumed_size = size;
546				val[1] = min_val[1] + consumed_size;
547			}
548			// If it consumed all the flow, break out.
549			if (consumed_size == a.size)
550				break;
551		}
552		for (i = 0; i < string_size; i++)
553		{
554			const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555			uint64_t val[2] = {
556				i_size, min_val[1]
557			};
558			uint64_t consumed_size = 0;
559			for (k = i + 1; k < string_size; k++)
560			{
561				const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size)({ typeof (i_size) _a = (i_size); typeof (tensor_blocks[strings
[k] - 1].size) _b = (tensor_blocks[strings[k] - 1].size); (_a
 < _b) ? _a : _b; });
562				// Update consumed size if it is bigger than "size".
563				if (size > consumed_size)
564				{
565					val[0] = size - consumed_size;
566					ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567					consumed_size = size;
568					val[1] = min_val[1] + consumed_size;
569				}
570				// If it consumed all the flow, break out.
571				if (consumed_size == i_size)
572					break;
573			}
574			val[0] = i_size - consumed_size;
575			// Still have residual, flow it to min_x.
576			if (val[0] > 0)
577				ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578		}
579		if (min_i == -1)
580		{
581			// If we decide to insert a new edge, simply mark anyone who does not interfere with it to be redone.
582			const int p = strings[0] - 1;
583			const int q = strings[string_size - 1] - 1;
584			const int type = tensor_blocks[p].type;
585#define for_block(y, val) do { \
586				if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
587				{ \
588					tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
589					if (tensor_blocks[y].companion_ref) \
590					{ \
591						const int companion_ref = tensor_blocks[y].companion_ref - 1; \
592						tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
593					} \
594				} \
595			} while(0)
596			ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
597			if (y_vector)
598				CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
599#undef for_block
600#define for_block(x, val) do { \
601				if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
602				{ \
603					tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
604					if (tensor_blocks[x].companion_ref) \
605					{ \
606						const int companion_ref = tensor_blocks[x].companion_ref - 1; \
607						tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
608					} \
609				} \
610			} while(0)
611			ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
612			if (x_vector)
613				CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
614#undef for_block
615		}
616		j += string_size;
617	}
618	ccfreefree(tensor_block_cannot_insert);
619	ccfreefree(buf);
620	ccv_array_free(opt);
621	ccv_matrix_free(tensor_df);
622	ccv_matrix_free(tensor_dt);
623#define for_block(y, x, val) do { \
624		if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
625		{ \
626			if (!alloc_dep[x - 1]) \
627				alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
628			ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
629		} \
630	} while (0)
631	CCV_SPARSE_FOREACH(alloc, for_block)do { if ((alloc)->major & CCV_SPARSE_COL_MAJOR) { switch
 ((((alloc)->type) & 0xFF000)) { case CCV_32S: { do { uint32_t
 _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__
((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF
); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0
; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t* const _idx_
 = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const
 _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 ||
 !_v_->size) continue; for (_j_ = 0; _j_ < _v_->size
; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.i32 +
 (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
 (alloc)->size; __attribute__((unused)) const size_t _c_ =
 (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.u8 + (0))); } } } } while (0); } } } else
 { switch ((((alloc)->type) & 0xFF000)) { case CCV_32S
: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->
size; __attribute__((unused)) const size_t _c_ = (((alloc)->
type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
 (alloc)->size; __attribute__((unused)) const size_t _c_ =
 (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } }
 } while (0);
632#undef for_block
633	ccv_matrix_free(alloc);
634	for (i = 0; i < tensor_block_size; i++)
635		if (adj[i].itf)
636			ccv_array_free(adj[i].itf);
637	ccfreefree(adj);
638	ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
639	alloc_prep->alloc_dep = alloc_dep;
640	alloc_prep->vt_block_size = tensor_block_size;
641	alloc_prep->buffer_size = num_assigned;
642	alloc_prep->block_size = available_tensor_size;
643	alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
644	alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
645	alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
646	memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
647	for (i = 0; i < num_assigned; i++)
648		alloc_prep->buffers[i].size = allocated_size[i];
649	if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
650	{
651		size_t total_size = 0;
652		for (i = 0; i < num_assigned; i++)
653			total_size += allocated_size[i];
654		PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf
("Total buffer size of %zu to be allocated\n", total_size); fflush
(stdout); } } while (0);
655	}
656	ccfreefree(allocated_size);
657	j = 0;
658	// Assigning out the tensors (in case of sharing tensors / in-place ops).
659	for (i = 0; i < tensor_block_size; i++)
660		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
661		{
662			alloc_prep->blocks[j].block_ref = i;
663			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
664			{
665				alloc_prep->vt_blocks[i] = j;
666				// Also, set its allocations.
667				assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 667, __extension__ __PRETTY_FUNCTION__
); }));
668				const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
669				alloc_prep->blocks[j].offset = allocated_offset[i];
670				if (!alloc_prep->buffers[buffer_ref].type)
671					alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
672				alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
673				alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
674				assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
 alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
 ({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 674, __extension__ __PRETTY_FUNCTION__
); }));
675			} else {
676				alloc_prep->vt_blocks[i] = -1;
677				alloc_prep->blocks[j].buffer_ref = -1;
678				alloc_prep->blocks[j].offset = 0;
679			}
680			++j;
681		} else
682			alloc_prep->vt_blocks[i] = -1;
683	ccfreefree(allocated_offset);
684	ccfreefree(assigned);
685	return alloc_prep;
686}
687 
688static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
689{
690	int i;
691	for (i = 0; i < alloc_prep->vt_block_size; i++)
692		if (alloc_prep->alloc_dep[i])
693			ccv_array_free(alloc_prep->alloc_dep[i]);
694	for (i = 0; i < alloc_prep->buffer_size; i++)
695		if (alloc_prep->buffers[i].dup_p_refs)
696			ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
697	ccfreefree(alloc_prep->alloc_dep);
698	ccfreefree(alloc_prep);
699}
700 
701// Simple allocator from ccv_array_t.
702static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
703{
704	int pos = tensor_metadata->rnum;
705	int rsize = (size + 15) / 16;
706	ccv_array_resize(tensor_metadata, pos + rsize);
707	return (pos << 1) + 1;
708}
709 
710static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
711{
712	assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 712, __extension__ __PRETTY_FUNCTION__
); }));
713	return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)));
714}
715 
// Non-zero when `ptr` has its low bit set, i.e. it is a tagged metadata position
// rather than a normal tensor pointer.
#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
717 
718static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
719{
720	// If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
721	if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
722		return vt_tensor;
723	ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
724	if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
725	{
726		const int alias_ref = tensor->alias_ref;
727		tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
728		_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
729	}
730	if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
731	{
732		ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
733		int i;
734		const int count = mv->kind + mv->repeat;
735		for (i = 0; i < count; i++)
736		{
737			if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1))
738			{
739				const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i];
740				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
741				_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
742			}
743		}
744		// No need to recursively do parent pointer, otherwise we are in deep rewire.
745		if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
746			mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
747		if (mv->sp)
748			for (i = 0; i < mv->sp->rnum; i++)
749			{
750				ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)));
751				if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
752				{
753					const int pos = (int)(intptr_t)*tensor;
754					*tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
755					assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
 "ccv_nnc_symbolic_graph_compile.c", 755, __extension__ __PRETTY_FUNCTION__
); }));
756					_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
757				}
758			}
759	}
760	return tensor;
761}
762 
// Pairs a raw byte address with an int position.
// NOTE(review): no use of this type is visible in this part of the file; presumably
// `pos` is a tagged metadata position for the tensor created at `ptr` — confirm.
typedef struct {
	const uint8_t* ptr;
	int pos;
} ccv_nnc_tensor_block_pos_t;
767 
768static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
769{
770	int i;
771	int unref_block_ref = block_ref;
772	while (prep->tensor_blocks[unref_block_ref].ref)
773		unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
774	int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
775	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 775, __extension__ __PRETTY_FUNCTION__); }));
776	assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
 == prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
 ("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 776, __extension__ __PRETTY_FUNCTION__
); }));
777	const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
778	uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
779	int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
780	for (i = idx - 1; i >= 0; i--)
781	{
782		assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
 (p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 782, __extension__ __PRETTY_FUNCTION__); }));
783		const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
784		const int unroll_count = graph_prep->unroll_count;
785		if (ch[i]) // Prefer the dup side of things.
786			p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
787		int unref_p_ref = p_ref;
788		while (graph_prep->tensor_blocks[unref_p_ref].ref)
789			unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
790		vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
791		const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
792		offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
793		// If the buffer already exists, prefer that.
794		const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
795		if (ptr)
796		{
797			// If I have any remaining path that is not covered from 0, I cannot possibly
798			// have any pointer from buffer (that can only happen if it is not dup).
799			for (--i; i >= 0; i--)
800				if (ch[i] != 0)
801					return 0;
802			// Try to find the created tensor block pos in the array, just linear scan.
803			const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
804			ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
805			*tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
806			ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
807			return tv_pos;
808		}
809		p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
810	}
811	return 0;
812}
813 
814// Descent from root to the prep level, and compose multiview from there.
815static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
816{
817	assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 817, __extension__ __PRETTY_FUNCTION__); }));
818	int i;
819	const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
820	const int unroll_count = prep->unroll_count;
821	if (prep == graph_prep)
822	{
823		const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
824		if (!data_pos)
825			return -1;
826		// Based on ch, go all the way back to find the exact pointer to compose.
827		if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
828			prep->dup_tensor_block_ref &&
829			prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
830			prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
831		{
832			int pos[unroll_count + 1];
833			pos[0] = data_pos;
834			for (i = 0; i < unroll_count; i++)
835				pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
836			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
837			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
838			ccv_nnc_tensor_t* data[unroll_count + 1];
839			for (i = 0; i < unroll_count + 1; i++)
840				data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
841			ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
842			for (i = 0; i < unroll_count + 1; i++)
843				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
844			*pos_ref = mv_pos;
845		} else {
846			*pos_ref = data_pos;
847		}
848		if (preserve)
849		{
850			// If need to preserve, this need to be more complicated. At loop 0, I need to access the new assigned tv.
851			// at any other loops, it should be the same. Thus, for this case, I will create a mv tensor as following:
852			// mv of K11, thus, when loop is 0, it unwrap to mv->data[0], otherwise, unwrap to mv->data[1].
853			// mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
854			// arena allocated).
855			// mv->data[1] (prev_mv_pos_ is a K01 or K02, depending on whether above we passed raw pointer directly or
856			// a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
857			// it to a K01 structure.
858			// Why we didn't wrap it directly as mv->data[0] pointing to a assigned tv pointer and the mv->data[1] pointing
859			// to the raw pointer (as ptr_ref) with K11? The reason is we don't know the assigned tv is pointing to one
860			// memory region, or is a managed by multi-view tensor, which could pointing to different memory regions.
861			int prev_mv_pos = *pos_ref;
862			if (prev_mv_pos == -1)
863			{
864				prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865				ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
866				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
867				ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
868					tv,
869				}, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
870				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
871			}
872			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
873			ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
874			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
875			ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
876				CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
877				(ccv_nnc_tensor_t*)prev_mv,
878			}, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
879			prev_mv->p = (void*)(intptr_t)mv_pos;
880			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
881			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
882			*pos_ref = mv_pos;
883		}
884		return 0;
885	}
886	ch[idx] = 0;
887	int pos[unroll_count + 1];
888	pos[0] = 0;
889	const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
890	assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 890, __extension__ __PRETTY_FUNCTION__); }));
891	for (i = 0; i < unroll_count; i++)
892	{
893		ch[idx] = i + 1;
894		pos[i + 1] = 0;
895		const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
896		if (dup_retval < 0)
897		{
898			assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 898, __extension__ __PRETTY_FUNCTION__); }));
899			break;
900		}
901	}
902	// If current prep has no dup.
903	if (i == 0)
904	{
905		*pos_ref = pos[0];
906		return 0;
907	}
908	ccv_nnc_tensor_t* data[unroll_count + 1];
909	// Compose to a new multiview.
910	for (i = 0; i < unroll_count + 1; i++)
911		{ assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
 (pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 911, __extension__ __PRETTY_FUNCTION__); })); }
912	const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
913	for (i = 0; i < unroll_count + 1; i++)
914		data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
915	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
916	ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
917	for (i = 0; i < unroll_count + 1; i++)
918		if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
919			((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
920	for (i = 0; i < unroll_count + 1; i++)
921		CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
922	*pos_ref = mv_pos;
923	return 0;
924}
925 
926static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
927{
928	int i;
929	int is_input = 0;
930	assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
 else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 930, __extension__ __PRETTY_FUNCTION__); }));
931	for (i = 0; i < node->input_size && !is_input; i++)
932		if (p_ref == node->inputs[i])
933			is_input = 1;
934	int is_output = 0;
935	for (i = 0; i < node->output_size && !is_output; i++)
936		if (p_ref == node->outputs[i])
937			is_output = 1;
938	// Prefer it is an output if it is both the input and the output.
939	if (is_output)
940		return 1;
941	if (is_input)
942		return -1;
943	return 0;
944}
945 
946static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
947{
948	// No need to check whether to preserve if this is not a while loop.
949	if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
950		return 0;
951	assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 951, __extension__ __PRETTY_FUNCTION__
); }));
952	// If it is unassigned, no need to preserve.
953	if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
 UNASSIGNED))
954		return 0;
955	const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
956	// If p is not input, no need to preserve at all.
957	if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
958		return 0;
959	const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
960	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 960, __extension__ __PRETTY_FUNCTION__); }));
961	assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
 graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
 __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 961, __extension__ __PRETTY_FUNCTION__
); }));
962	const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
963	// If the buffer is a truly read-only one, no need to preserve.
964	if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
 0xc) == READ_ONLY)
965		return 0;
966	/* This needs detailed explanation, what does preserve mean?
967	 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
968	 * also used outside of the while loop, we cannot reuse the memory region of x for
969	 * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
970	 * y uses the same memory region as x). The way to workaround this is by using a different
971	 * memory region for y = x + 1, but for the first iteration, having x pointing to the
972	 * original. During the allocation process, the way to identify whether x should preserve
973	 * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
974	 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
975	 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
976	 * it is the input tensor whenever that is possible. A tensor block can point to two parent
977	 * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
978	 * tensor whenever that is possible. */
979	if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
980		return 0;
981	// Otherwise, return 1 because we now need to preserve.
982	return 1;
983}
984 
// Decide whether the tensor block `block_ref` of this graph prep must be force-broadcast.
// Returns 0 as soon as one of the checks below rules it out, 1 otherwise. To get 1, the block
// must be assigned, its symbol must carry CCV_NNC_TENSOR_SYMBOL_TAPE_VAR, its first parent
// reference (p_refs[0] - 1) must be reported as 1 by
// _ccv_nnc_is_symbolic_graph_exec_input_or_output for the parent exec symbol, and the buffer it
// was allocated into must not be READ_ONLY.
// NOTE(review): graph_prep->p is dereferenced without a check, so callers presumably only pass
// the prep of a sub-graph -- confirm.
985static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
986{
987	assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 987, __extension__ __PRETTY_FUNCTION__
); }));
988	// If it is unassigned, no need to broadcast.
989	if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
 UNASSIGNED))
990		return 0;
991	// Only tape var need to force broadcast, otherwise we already share the same memory region.
992	if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
993		return 0;
	// p_refs stores 1-based references, hence the - 1 to get the index in the parent graph.
994	const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
995	// If p is not output, no need to broadcast at all.
996	if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
997		return 0;
	// Map the block to its allocation record; the asserts check the mapping is valid and round-trips.
998	const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
999	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 999, __extension__ __PRETTY_FUNCTION__); }));
1000	assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
 graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
 __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1000, __extension__ __PRETTY_FUNCTION__
); }));
1001	const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
1002	// If the buffer is a truly read-only one, no need to broadcast.
1003	if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
 0xc) == READ_ONLY)
1004		return 0;
1005	// Otherwise, return 1 because we now need to force broadcast for this tape var.
1006	return 1;
1007}
1008 
// Fill every placeholder slot of the multiview `mv` with `tensor`.
// Visits the first (mv->kind + mv->repeat) data slots: a slot equal to CCV_NNC_TENSOR_PLACEHOLDER
// is overwritten with `tensor`; a slot that is itself a multiview is processed recursively.
// All other slots are left untouched. `mv` must be a multiview (asserted).
1009static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
1010{
1011	assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1011, __extension__ __PRETTY_FUNCTION__); }));
1012	int i;
1013	for (i = 0; i < mv->kind + mv->repeat; i++)
1014		if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
1015			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = tensor;
1016		else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW))
1017			_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i], tensor);
1018}
1019 
// Replace metadata positions stored inside the multiview `mv` with real pointers looked up in
// `tensor_metadata`. A value is treated as a metadata position when its low bit is set
// (CCV_NNC_IS_METADATA_POS).
// Two passes:
//  1. If mv->sp exists, each entry that is a position is resolved with
//     _ccv_nnc_tensor_metadata_get (the result must not be a multiview, asserted) and then
//     _ccv_nnc_tensor_metadata_rewire is called with the original position.
//  2. For each of the first (mv->kind + mv->repeat) data slots: resolve the slot itself if it is
//     a position, resolve its alias_ref if that is a position, and recurse if the slot is a
//     multiview.
1020static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1021{
1022	assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1022, __extension__ __PRETTY_FUNCTION__); }));
1023	int i;
1024	if (mv->sp)
1025		for (i = 0; i < mv->sp->rnum; i++)
1026		{
1027			ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)));
1028			if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1029			{
				// Keep the position around: the entry is overwritten with the pointer, but the
				// rewire call below still takes the position.
1030				const int pos = (int)(intptr_t)*tensor;
1031				*tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1032				assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
 "ccv_nnc_symbolic_graph_compile.c", 1032, __extension__ __PRETTY_FUNCTION__
); }));
1033				_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1034			}
1035		}
1036	for (i = 0; i < mv->kind + mv->repeat; i++)
1037	{
		// Order matters: the slot is turned into a pointer first, because the next two checks
		// dereference it.
1038		if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
 : (mv)->_inline_data)[i]) & 1))
1039			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1040		if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
 : (mv)->_inline_data)[i]->alias_ref) & 1))
1041			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]->alias_ref);
1042		if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW))
1043			_ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1044	}
1045}
1046 
// Build the multiview for `block_ref` and return its position (asserted > 0), as produced by
// _ccv_nnc_tensor_multiview_down_find_pos.
// The function first measures the depth `c` of graph_prep below the root (following ->p), then
// lays out the chain of preps in `preps` with the root at index 0 and graph_prep at index c - 1,
// and passes that chain plus a zeroed selection array `ch` to the helper.
// `tensor_arena` is not used in this function body.
1047static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1048{
1049	// Go to the root of the graph.
1050	const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1051	int i;
	// i starts at 1 so that it ends up as the number of preps on the path, root included.
1052	for (i = 1; prep->p; i++)
1053		prep = prep->p;
1054	// Root graph should have no dup tensor blocks.
1055	assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
 ({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1055, __extension__ __PRETTY_FUNCTION__); }));
1056	const int c = i;
1057	const ccv_nnc_symbolic_graph_prep_t* preps[c];
1058	prep = graph_prep;
1059	preps[c - 1] = prep;
	// Walk up again, writing each ancestor one slot closer to index 0 (the root).
1060	for (i = 0; prep->p; i++)
1061		preps[c - 2 - i] = prep = prep->p;
1062	int ch[c]; // Use dynamic allocation for array. This is an array to record our selections when recursive from top to bottom.
1063	memset(ch, 0, sizeof(int) * c);
1064	int pos = 0;
1065	_ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1066	assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
 (ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1066, __extension__ __PRETTY_FUNCTION__); })); // This should never be modified.
1067	assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
 > 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1067, __extension__ __PRETTY_FUNCTION__); }));
1068	return pos;
1069}
1070 
// Allocate a new multiview in `tensor_metadata` whose two slots are
// { CCV_NNC_TENSOR_PLACEHOLDER, tensor } and return the metadata position of that multiview.
// `tensor` may be either a real pointer or a metadata position (low bit set). The multiview is
// initialized through ccv_nnc_tensor_multiview with the resolved pointer, and afterwards the
// slots are overwritten so that slot 1 holds `tensor` exactly as it was passed in.
// `params` is not used in this function body.
1071static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1072{
1073	const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1074	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1075	ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1076	ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1077		CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1078		tv,
1079	}, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
	// Re-store the slots explicitly; slot 1 gets the caller's value (possibly a position), not tv.
1080	CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1081	CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = tensor;
1082	return mv_pos;
1083}
1084 
// Return a metadata position that refers to a plain (non-multiview) tensor.
// If the tensor at `pos` is not a multiview, `pos` is returned unchanged. Otherwise the function
// follows slot 0 of each nested multiview until it reaches a non-multiview tensor, creates a new
// tensor in `tensor_metadata` with the same data pointer, info and dataof, sets its alias_ref to
// `pos`, registers it with the multiview at `pos` via ccv_nnc_tensor_synchronize_to_multiview,
// and returns the new position.
1085static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1086{
1087	ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1088	const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1089	if (!is_multiview)
1090		return pos;
1091	while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1092	{
1093		const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1094		tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1095	}
	// Copy by value before allocating the new slot.
	// NOTE(review): presumably because _ccv_nnc_tensor_metadata_pos_new can move the metadata
	// storage and invalidate tensor_ptr -- confirm.
1096	const ccv_nnc_tensor_t tensor = *tensor_ptr;
1097	const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1098	ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1099	*new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1100	new_tensor->dataof = tensor.dataof;
1101	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1102	new_tensor->alias_ref = (uintptr_t)pos;
1103	ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1104	return new_pos;
1105}
1106 
// Create the tensor for the alias symbol `block_ref` and store its metadata position in
// vt_tensors[block_ref].
// The symbol it aliases (tensor_symbol_info[block_ref].alias_ref - 1) must already have an entry
// in vt_tensors. If that entry is a multiview, slot 0 is followed down to a plain tensor, whose
// data pointer is then shared:
//  - when the symbol's ofs equals ccv_nnc_no_ofs and ccv_nnc_is_tensor_stride_packed accepts its
//    stride/dim, a regular tensor is created (alias_ref is not set on this path);
//  - otherwise a tensor view is created with alias_ref set to the aliased tensor's position.
// If the aliased entry was a multiview, the new tensor is registered with it through
// ccv_nnc_tensor_synchronize_to_multiview.
1107static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1108{
1109	const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1110	// The tensor being aliased must already be assigned, and (checked below) must not itself be a tensor view.
1111	assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__
 ({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1111, __extension__ __PRETTY_FUNCTION__
); }));
1112	const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1113	const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1114	assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1114, __extension__ __PRETTY_FUNCTION__
); }));
1115	// Will use that to determine whether insert reference or not.
1116	const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1117	while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1118	{
1119		const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1120		alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1121	}
	// Copy by value before any new metadata slot is allocated below.
	// NOTE(review): presumably the allocation can invalidate alias_tensor_ptr -- confirm.
1122	const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1123	// If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1124	int pos;
1125	if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1126		ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1127	{
1128		pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1129		ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1130		*tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1131		tensor->dataof = alias_tensor.dataof;
1132	} else {
1133		pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1134		ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1135		// Otherwise initialize a tensor view
1136		*tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1137		tensor_view->alias_ref = (uintptr_t)alias_pos;
1138	}
1139	vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1140	if (is_multiview)
1141	{
1142		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1143		ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1144	}
1145}
1146 
// Make sure vt_tensors[block_ref] is populated, resolving whatever it depends on first.
// Two cases:
//  - The block is unassigned, has tensor_blocks[block_ref].alias_ref set and no vt_tensors entry
//    yet: it reuses the vt_tensors entry of the block it points to (resolved recursively when
//    missing) and returns.
//  - Otherwise the symbol must have tensor_symbol_info[block_ref].alias_ref (asserted): the
//    aliased symbol is resolved recursively when missing, then the alias tensor is created with
//    _ccv_nnc_assign_vt_tensor_aliases.
1147static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1148{
1149	// If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1150	if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1151	{
		// alias_ref is stored 1-based; 0 means "none".
1152		const int ref = tensor_blocks[block_ref].alias_ref - 1;
1153		if (!vt_tensors[ref])
1154			_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1155		vt_tensors[block_ref] = vt_tensors[ref];
1156		return;
1157	}
1158	assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1158, __extension__ __PRETTY_FUNCTION__
); }));
1159	const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1160	// If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1161	if (!vt_tensors[alias_ref])
1162		_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1163	_ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1164}
1165 
1166// Turn a linear pointer to an object storage (such as MTLBuffer).
1167#ifdef HAVE_MPS
// Disposer callback (only built with HAVE_MPS): releases `ptr` by calling mpobjfree(0, ptr).
// `userdata` is ignored.
1168static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
1169{
1170	mpobjfree(0, ptr);
1171}
1172#endif
1173 
// Pairs a byte size with an opaque object pointer.
// NOTE(review): not referenced in the part of the file visible here; usage to be confirmed.
1174typedef struct {
1175	size_t size;
1176	void* obj;
1177} tensor_arena_obj_track_t;
1178 
// Key of the obj_ptr hash map: a base pointer plus an offset and a size.
// Two keys are equal only when all three fields match (see _kh_obj_ptr_hash_equal).
1179typedef struct {
1180	void* ptr;
1181	off_t offset;
1182	size_t size;
1183} obj_ptr_key_t;
1184 
// Hash for obj_ptr_key_t: the pointer value shifted right by 4 bits, plus offset, plus size.
// The 64-bit sum is truncated to khint32_t on return.
1185static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1186{
1187	return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1188}
1189 
// Equality for obj_ptr_key_t: returns non-zero only when ptr, offset and size are all equal.
1190static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1191{
1192	return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1193}
1194 
// Instantiates the khash table type kh_obj_ptr_t: keys are obj_ptr_key_t, values are void*,
// hashed with _kh_obj_ptr_hash_func and compared with _kh_obj_ptr_hash_equal.
// Everything after the closing parenthesis of KHASH_INIT(...) is the analyzer's rendering of the
// macro expansion (kh_init/destroy/clear/get/resize/put/del for this table), not hand-written code.
// In that expansion kh_put_obj_ptr sets *ret to 0 when the key is already present, 1 or 2 when it
// is newly inserted, and -1 (returning n_buckets) when a resize fails.
1195KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)typedef struct kh_obj_ptr_s { khint_t n_buckets, size, n_occupied
, upper_bound; khint32_t *flags; obj_ptr_key_t *keys; void* *
vals; } kh_obj_ptr_t; static inline __attribute__ ((__unused__
)) kh_obj_ptr_t *kh_init_obj_ptr(void) { return (kh_obj_ptr_t
*)calloc(1,sizeof(kh_obj_ptr_t)); } static inline __attribute__
 ((__unused__)) void kh_destroy_obj_ptr(kh_obj_ptr_t *h) { if
 (h) { free((void *)h->keys); free(h->flags); free((void
 *)h->vals); free(h); } } static inline __attribute__ ((__unused__
)) void kh_clear_obj_ptr(kh_obj_ptr_t *h) { if (h && h
->flags) { memset(h->flags, 0xaa, ((h->n_buckets) <
 16? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h->
size = h->n_occupied = 0; } } static inline __attribute__ (
(__unused__)) khint_t kh_get_obj_ptr(const kh_obj_ptr_t *h, obj_ptr_key_t
 key) { if (h->n_buckets) { khint_t k, i, last, mask, step
 = 0; mask = h->n_buckets - 1; k = _kh_obj_ptr_hash_func(key
); i = k & mask; last = i; while (!((h->flags[i>>
4]>>((i&0xfU)<<1))&2) && (((h->
flags[i>>4]>>((i&0xfU)<<1))&1) || !
_kh_obj_ptr_hash_equal(h->keys[i], key))) { i = (i + (++step
)) & mask; if (i == last) return h->n_buckets; } return
 ((h->flags[i>>4]>>((i&0xfU)<<1))&
3)? h->n_buckets : i; } else return 0; } static inline __attribute__
 ((__unused__)) int kh_resize_obj_ptr(kh_obj_ptr_t *h, khint_t
 new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (
--(new_n_buckets), (new_n_buckets)|=(new_n_buckets)>>1,
 (new_n_buckets)|=(new_n_buckets)>>2, (new_n_buckets)|=
(new_n_buckets)>>4, (new_n_buckets)|=(new_n_buckets)>>
8, (new_n_buckets)|=(new_n_buckets)>>16, ++(new_n_buckets
)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size
 >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0
; else { new_flags = (khint32_t*)malloc(((new_n_buckets) <
 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t)); if (
!new_flags) return -1; memset(new_flags, 0xaa, ((new_n_buckets
) < 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t))
; if (h->n_buckets < new_n_buckets) { obj_ptr_key_t *new_keys
 = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets *
 sizeof(obj_ptr_key_t)); if (!new_keys) { free(new_flags); return
 -1; } h->keys = new_keys; if (1) { void* *new_vals = (void
**)realloc((void *)h->vals,new_n_buckets * sizeof(void*));
 if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals
; } } } } if (j) { for (j = 0; j != h->n_buckets; ++j) { if
 (((h->flags[j>>4]>>((j&0xfU)<<1))&
3) == 0) { obj_ptr_key_t key = h->keys[j]; void* val; khint_t
 new_mask; new_mask = new_n_buckets - 1; if (1) val = h->vals
[j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<<
1)); while (1) { khint_t k, i, step = 0; k = _kh_obj_ptr_hash_func
(key); i = k & new_mask; while (!((new_flags[i>>4]>>
((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask
; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<<
1))); if (i < h->n_buckets && ((h->flags[i>>
4]>>((i&0xfU)<<1))&3) == 0) { { obj_ptr_key_t
 tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (1
) { void* tmp = h->vals[i]; h->vals[i] = val; val = tmp
; } (h->flags[i>>4]|=1ul<<((i&0xfU)<<
1)); } else { h->keys[i] = key; if (1) h->vals[i] = val
; break; } } } } if (h->n_buckets > new_n_buckets) { h->
keys = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets
 * sizeof(obj_ptr_key_t)); if (1) h->vals = (void**)realloc
((void *)h->vals,new_n_buckets * sizeof(void*)); } free(h->
flags); h->flags = new_flags; h->n_buckets = new_n_buckets
; h->n_occupied = h->size; h->upper_bound = (khint_t
)(h->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static
 inline __attribute__ ((__unused__)) khint_t kh_put_obj_ptr(kh_obj_ptr_t
 *h, obj_ptr_key_t key, int *ret) { khint_t x; if (h->n_occupied
 >= h->upper_bound) { if (h->n_buckets > (h->size
<<1)) { if (kh_resize_obj_ptr(h, h->n_buckets - 1) <
 0) { *ret = -1; return h->n_buckets; } } else if (kh_resize_obj_ptr
(h, h->n_buckets + 1) < 0) { *ret = -1; return h->n_buckets
; } } { khint_t k, i, site, last, mask = h->n_buckets - 1,
 step = 0; x = site = h->n_buckets; k = _kh_obj_ptr_hash_func
(key); i = k & mask; if (((h->flags[i>>4]>>
((i&0xfU)<<1))&2)) x = i; else { last = i; while
 (!((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && (((h->flags[i>>4]>>((i&0xfU)
<<1))&1) || !_kh_obj_ptr_hash_equal(h->keys[i], key
))) { if (((h->flags[i>>4]>>((i&0xfU)<<
1))&1)) site = i; i = (i + (++step)) & mask; if (i ==
 last) { x = site; break; } } if (x == h->n_buckets) { if (
((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && site != h->n_buckets) x = site; else x = i; }
 } } if (((h->flags[x>>4]>>((x&0xfU)<<
1))&2)) { h->keys[x] = key; (h->flags[x>>4]&=
~(3ul<<((x&0xfU)<<1))); ++h->size; ++h->
n_occupied; *ret = 1; } else if (((h->flags[x>>4]>>
((x&0xfU)<<1))&1)) { h->keys[x] = key; (h->
flags[x>>4]&=~(3ul<<((x&0xfU)<<1)))
; ++h->size; *ret = 2; } else *ret = 0; return x; } static
 inline __attribute__ ((__unused__)) void kh_del_obj_ptr(kh_obj_ptr_t
 *h, khint_t x) { if (x != h->n_buckets && !((h->
flags[x>>4]>>((x&0xfU)<<1))&3)) { (
h->flags[x>>4]|=1ul<<((x&0xfU)<<1));
 --h->size; } }
1196 
// Produce the data pointer (or, with HAVE_MPS, the backing object) for a tensor that lives at
// `offset` bytes inside the buffer `ptr`.
//  - Returns 0 when params.dim[0] == 0.
//  - With HAVE_MPS and a GPU-memory tensor: looks up (ptr, offset, size) in `obj_ptr_map`, where
//    size is the data type size times ccv_nnc_tensor_count(params). On a miss it creates the
//    object with mpobjcreate, registers a disposer for it on `tensor_arena` (creating the
//    disposers array on first use), stores it in the map and returns it. On a hit it returns
//    the stored object.
//  - In every other case returns ptr + offset.
// `total_size` is not used in this function body.
// NOTE(review): kh_put reports a failed resize with ret == -1 and returns n_buckets; that case
// also takes the `ret != 0` branch and would store to vals[n_buckets]. Worth an explicit check.
// NOTE(review): `ptr + offset` is arithmetic on void*, which relies on a compiler extension.
1197static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1198{
1199	if (params.dim[0] == 0)
1200		return 0;
1201#ifdef HAVE_MPS
1202	if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1203	{
1204		int ret;
1205		const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
 12] * ccv_nnc_tensor_count(params);
1206		const obj_ptr_key_t key = {
1207			.ptr = ptr,
1208			.offset = offset,
1209			.size = size,
1210		};
1211		khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
		// ret == 0 means the key was already in the map; anything else is handled as a new entry.
1212		if (ret != 0)
1213		{
1214			void* obj = mpobjcreate(ptr, offset, size);
1215			if (!tensor_arena->disposers)
1216				tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1217			ccv_nnc_arena_disposer_t disposer = {
1218				.ptr = obj,
1219				.userdata = 0,
1220				.dispose = _ccv_nnc_tensor_arena_obj_dispose
1221			};
1222			ccv_array_push(tensor_arena->disposers, &disposer);
1223			kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1224			return obj;
1225		} else
1226			return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1227	}
1228#endif
1229	return ptr + offset;
1230}
1231 
1232static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1233{
1234	// All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1235	// Each tensor have the designation in assigned array, and offset in allocated_offset.
1236	const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1237	ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1238	const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1239	const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1240	const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1241	const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1242	const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1243	const int unroll_count = graph_prep->unroll_count;
1244	int i, j;
1245	for (i = 0; i < tensor_symbol_info_size; i++)
1246		for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1247		{
1248			const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1249			if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1250				TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
);
1251		}
1252	ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1253	graph_prep->tensor_arena = tensor_arena;
1254	tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1255	tensor_arena->buffers = (void*)(tensor_arena + 1);
1256	tensor_arena->buffer_size = alloc_prep->buffer_size;
1257	tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1258	tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1259	tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1260	tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1261	tensor_arena->pb_vt_tensors = 0;
1262	tensor_arena->vt_alias_r_refs_p = 0;
1263	tensor_arena->vt_alias_r_refs = 0;
1264	tensor_arena->vt_sizes = 0;
1265	tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1266	tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, (sizeof(ccv_nnc_tensor_t) * tensor_symbol_info_size + 15) / 16, 0);
1267	tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1268	tensor_arena->allocator.context.free = allocator.context.free;
1269	tensor_arena->allocator.isa = allocator.isa;
1270	tensor_arena->disposers = 0;
1271	// Copy alias_ref info back to the tensor arena.
1272	for (i = 0; i < tensor_symbol_info_size; i++)
1273		tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1274	// Do the buffer copies.
1275	for (i = 0; i < alloc_prep->buffer_size; i++)
1276		tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1277			tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1278			tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1279	if (graph_prep->while_count_tensor)
1280	{
1281		// If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1282		int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1283		assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
 ({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1283, __extension__ __PRETTY_FUNCTION__
); })); // pos must be 0 position.
1284		ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1285		*tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1286	}
1287	assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
 && !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
 && p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1287, __extension__ __PRETTY_FUNCTION__
); }));
1288	if (p_arena && p_graph_prep)
1289	{
1290		// Don't need to allocate the actual buffer, just use the pointer from the above.
1291		PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("Buffer assignment for sub arena %p (parent %p)\n",
 tensor_arena, p_arena); fflush(stdout); } } while (0);
1292		for (i = 0; i < tensor_arena->buffer_size; i++)
1293		{
1294			const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1295			int unref_p_ref = p_ref;
1296			while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1297				unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1298			assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
 ({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1298, __extension__ __PRETTY_FUNCTION__
); }));
1299			const int p_unroll_count = p_graph_prep->unroll_count;
1300			if (p_graph_prep->dup_tensor_block_ref &&
1301				p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1302				p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1303			{
1304				// This condition means in the parent graph, we point to multiple tensor blocks for the same
1305				// buffer, therefore, we cannot have one single pointer assigned in this case.
1306				// Later we will handle this by generate ccv_tensor_multiview_t structure.
1307				tensor_arena->buffers[i].ptr = 0;
1308				PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0);
1309				continue;
1310			}
1311			// Otherwise, find the actual buffer pointer.
1312			const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1313			assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1313, __extension__ __PRETTY_FUNCTION__); }));
1314			const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1315			if (!p_arena->buffers[buffer_ref].ptr)
1316			{
1317				// Pass it down as 0 ptr.
1318				tensor_arena->buffers[i].ptr = 0;
1319				PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0);
1320				continue;
1321			}
1322			const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1323			tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1324			PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
 (0);
1325		}
1326	} else {
1327		// Now, allocate actual buffers.
1328		PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0);
1329		for (i = 0; i < tensor_arena->buffer_size; i++)
1330		{
1331			const int buffer_type = tensor_arena->buffers[i].type;
1332			const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1333			if (tensor_arena->buffers[i].size == 0)
1334			{
1335				tensor_arena->buffers[i].ptr = 0;
1336				PRINT(CCV_CLI_VERBOSE, "|-Skip buffer %d with size 0\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Skip buffer %d with size 0\n", i); fflush(stdout
); } } while (0);
1337				continue;
1338			}
1339#ifdef HAVE_CUDA1
1340			if (memory_type == CCV_TENSOR_GPU_MEMORY)
1341			{
1342				const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1343				if (allocator.isa && allocator.isa->alloc)
1344					tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1345				else
1346					tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1347				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1348			} else {
1349				assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1349, __extension__ __PRETTY_FUNCTION__
); }));
1350				if (tensor_arena->buffers[i].pin_mem)
1351					tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1352				else
1353					ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1354				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1355			}
1356#elif defined(HAVE_MPS)
1357			if (memory_type == CCV_TENSOR_GPU_MEMORY)
1358			{
1359				const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1360				// if (allocator.isa && allocator.isa->alloc)
1361				// 	tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1362				// else
1363				tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1364				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1365			} else {
1366				assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1366, __extension__ __PRETTY_FUNCTION__
); }));
1367				ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1368				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1369			}
1370#else
1371			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1371, __extension__ __PRETTY_FUNCTION__
); }));
1372			ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1373			PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1374#endif
1375			assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
 ({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
 ("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1375, __extension__ __PRETTY_FUNCTION__); }));
1376		}
1377	}
1378	// Go over sub_preps and allocate arenas for them. Do it this early because
1379	// we may reference tensors from sub arenas. We need to reference tensors
1380	// from sub arenas because, for output tensors, the sub arena's tensor
1381	// will have automatic reference updates.
1382	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1383		if (graph_prep->sub_preps[i])
1384			tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1385		else
1386			tensor_arena->sub_arenas[i] = 0;
1387	memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1388	// Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1389	ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1390#ifdef HAVE_MPS
1391	khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1392#else
1393	khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1394#endif
1395	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1396		if (tensor_arena->sub_arenas[i])
1397		{
1398			assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
 ({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1398, __extension__ __PRETTY_FUNCTION__
); }));
1399			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1400			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1401			if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1402				for (j = 0; j < node->output_size; j++)
1403				{
1404					const int idx = node->outputs[j];
1405					const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1;
1406					assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
 (s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1406, __extension__ __PRETTY_FUNCTION__); }));
1407					ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1408					assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
 ({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
 ("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1408, __extension__ __PRETTY_FUNCTION__); }));
1409					ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1410					// Only assign if it is a multiview tensor.
1411					if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1412						(sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1413						sub_arena_out_tensors[idx] = sub_tensor;
1414				}
1415		}
1416	// Assigning out the tensors (in case of sharing tensors / in-place ops).
1417	for (i = 0; i < tensor_symbol_info_size; i++)
1418		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
1419		{
1420			const int vt_ref = alloc_prep->vt_blocks[i];
1421			const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1422			// Either we have dup_tensor_block_ref in current layer, or we have that in
1423			// previous layer, therefore, cannot really find the buffer ptr.
1424			if (tensor_symbol_info[i].info.dim[0] != 0 &&
1425				(!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1426				((graph_prep->dup_tensor_block_ref &&
1427				  graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1428				  graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1429				 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1430			{
1431				assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1431, __extension__ __PRETTY_FUNCTION__
); })); // This must be in a sub-graph.
1432				// If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1433				if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1434					continue;
1435				const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1436				tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1437				ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1438			} else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1439				// When we want to allocate, we don't really need to if it needs force broadcast, because we will handle that later.
1440				const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1441				// If already created, use the same tensor, and continue.
1442				// Having ptr.
1443				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1444				ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1445				// Also, set its allocations.
1446				// Since tensor view is bit compatible with tensor, we can just cast.
1447				void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1448				*tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1449				assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1449, __extension__ __PRETTY_FUNCTION__
); }));
1450				// If we need to force broadcast, we need to wrap it in a multiview.
1451				if (graph_prep->tensor_blocks[i].p_refs[0] &&
1452					_ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1453				{
1454					const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1455					ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1456					ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1457					ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1458						tv,
1459					}, 0, 1, graph_prep->graph, mv);
1460					CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1461					pos = mv_pos;
1462					ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1463				}
1464				tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1465			}
1466		}
1467#ifdef HAVE_MPS
1468	kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1469#endif
1470	// Handle binded tensors. First handle cases without aliases.
1471	for (i = 0; i < tensor_bind_size; i++)
1472	{
1473		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1473, __extension__ __PRETTY_FUNCTION__
); }));
1474		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1475		if (resolved_symbol.d >= 0)
1476		{
1477			int d = resolved_symbol.d;
1478			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1479				continue;
1480			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1481			// It has nothing to do with alias.
1482			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1483				d = tensor_blocks[d].ref - 1;
1484			// For binded tensors, it shouldn't be assigned yet.
1485			// If it is assigned, the pointer should match the ones from the binded tensor.
1486			// This can only happen if an enforced in-place tensor is binded twice. If that
1487			// happens, we need to make sure it is binded to the same location.
1488			assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1488, __extension__ __PRETTY_FUNCTION__
); }));
1489			// See above assertion.
1490			if (tensor_arena->vt_tensors[d])
1491				continue;
1492			if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1493			{
1494				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1495				ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1496				ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1497				if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1498					for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1499						{ assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
 ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1499, __extension__ __PRETTY_FUNCTION__
); })); }
1500				// It is OK to be just as a whole smaller or equal to the binded one.
1501				assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1501, __extension__ __PRETTY_FUNCTION__
); }));
1502				memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1503				memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1504				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1505			} else {
1506				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1507				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1508				*tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1509				tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1510				tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1511				tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1512				tv->dataof = tensor_binds[i].tensor->dataof;
1513				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1514			}
1515		}
1516	}
1517	// Handle binded tensors. We handle alias here so it can reference to binded tensors.
1518	for (i = 0; i < tensor_bind_size; i++)
1519	{
1520		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1520, __extension__ __PRETTY_FUNCTION__
); }));
1521		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1522		if (resolved_symbol.d >= 0)
1523		{
1524			int d = resolved_symbol.d;
1525			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1526				d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1527			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1528			// It has nothing to do with alias.
1529			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1530				d = tensor_blocks[d].ref - 1;
1531			if (tensor_arena->vt_tensors[d])
1532				continue;
1533			// Assert original alias has no ofs. Otherwise our binding will be problematic.
1534			for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1535				{ assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
 == 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1535, __extension__ __PRETTY_FUNCTION__
); })); }
1536			if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1537			{
1538				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1539				ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1540				ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1541				if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1542					for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1543						{ assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
 ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1543, __extension__ __PRETTY_FUNCTION__
); })); }
1544				// It is OK to be just as a whole smaller or equal to the binded one.
1545				assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1545, __extension__ __PRETTY_FUNCTION__
); }));
1546				memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1547				memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1548				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1549			} else {
1550				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1551				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1552				*tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1553				tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1554				tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1555				tv->data = tensor_binds[i].tensor->data;
1556				tv->dataof = tensor_binds[i].tensor->dataof;
1557				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1558			}
1559		}
1560	}
1561	// Assign out refs. Refs are the simple case, so we handle them first (because they point to exactly the same metadata and same region).
1562	// Skip refs that are actually aliases.
1563	for (i = 0; i < tensor_symbol_info_size; i++)
1564		// It could be binded tensor (or unused), in that case, it doesn't have a ref.
1565		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1566		{
1567			int ref = tensor_blocks[i].ref - 1;
1568			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1569				ref = tensor_blocks[ref].ref - 1;
1570			assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
 ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
 ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1570, __extension__ __PRETTY_FUNCTION__); }));
1571			tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1572		}
1573	// Now that refs are assigned out, handle the tensors that need to be preserved because this graph is a sub-graph of a while loop.
1574	if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1575	{
1576		assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1576, __extension__ __PRETTY_FUNCTION__
); }));
1577		const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1578		const int p_idx = graph_prep->p_idx - 1;
1579		for (i = 0; i < node->input_size; i++)
1580		{
1581			const int idx = node->inputs[i];
1582			int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx))) - 1;
1583			assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
 ({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1583, __extension__ __PRETTY_FUNCTION__); }));
1584			const int vt_ref = alloc_prep->vt_blocks[block_ref];
1585			if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1586				continue;
1587			assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1587, __extension__ __PRETTY_FUNCTION__); }));
1588			const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1589			assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
 == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1589, __extension__ __PRETTY_FUNCTION__); }));
1590			assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
 == ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1590, __extension__ __PRETTY_FUNCTION__
); }));
1591			// Either we have dup_tensor_block_ref in current layer, or we have that in
1592			// previous layer, therefore, cannot really find the buffer ptr.
1593			if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1594				((graph_prep->dup_tensor_block_ref &&
1595				  graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1596				  graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1597				 !tensor_arena->buffers[buffer_ref].ptr))
1598			{
1599				// We haven't allocated anything for this yet.
1600				assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
 ? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1600, __extension__ __PRETTY_FUNCTION__
); }));
1601				const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1602				tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1603				ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1604			} else {
1605				const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1606				tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1607				ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1608			}
1609		}
1610	}
1611	// For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1612	// This creates the multi-view tensor to achieve that.
1613	for (i = 0; i < tensor_symbol_info_size; i++)
1614		if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1615		{
1616			const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1617			// Create phi multi-view.
1618			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1619			const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1620			const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1621			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1622			ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1623			ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1624			ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1625				intv,
1626				outv,
1627			}, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1628			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1629			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1630			tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1631			ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1632		}
1633	// Now it is time to handle alias.
1634	for (i = 0; i < alloc_prep->block_size; i++)
1635		if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1636		{
1637			const int block_ref = alloc_prep->blocks[i].block_ref;
1638			if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1639			{
1640				// Assigning out the tensor aliases.
1641				assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1641, __extension__ __PRETTY_FUNCTION__
); }));
1642				_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1643			}
1644		}
1645	// Now assigning out the rest of alias refs.
1646	for (i = 0; i < tensor_symbol_info_size; i++)
1647		// It could be binded tensor (or unused), in that case, it doesn't have a ref.
1648		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1649		{
1650			int ref = tensor_blocks[i].alias_ref - 1;
1651			assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
 ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
 ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1651, __extension__ __PRETTY_FUNCTION__); }));
1652			tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1653		}
1654	// Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1655	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1656		if (tensor_arena->sub_arenas[i])
1657		{
1658			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1659			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1660			for (j = 0; j < node->input_size; j++)
1661			{
1662				const int idx = node->inputs[j];
1663				const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1 : -1;
1664				if (s_idx < 0)
1665					continue;
1666				ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1667				// Only do the replacement if it is a multi-view tensor.
1668				// sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1669				if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1670				{
1671					// It cannot be binded tensor.
1672					assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1672, __extension__ __PRETTY_FUNCTION__
); }));
1673					const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1674					const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1675					ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1676					// If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1677					// to this tensor.
1678					if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1679					{
1680						const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1681						ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1682						ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1683						ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1684						ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1685						ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
 : (multiview)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)[0]);
1686						while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1687							tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
 ? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)[0]);
1688						*ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1689						ref_tensor->data = tv->data;
1690						ref_tensor->dataof = tv->dataof;
1691						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1692					} else
1693						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1694				}
1695			}
1696		}
1697	// After alias created, for case..of statement, we now revert back to flat tensor rather than multi-view.
1698	// No worries though, this new tensor is subscribed for the phi multi-view. More over, we have logic
1699	// when initialize case..of node, which will take the phi multi-view again.
1700	for (i = 0; i < tensor_symbol_info_size; i++)
1701		if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1702		{
1703			assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
 & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1703, __extension__ __PRETTY_FUNCTION__
); }));
1704			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1705			assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
 ({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1705, __extension__ __PRETTY_FUNCTION__); }));
1706			tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1707		}
1708	// rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1709	for (i = 0; i < tensor_symbol_info_size; i++)
1710		if (tensor_arena->vt_tensors[i])
1711			tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1712	// Associate multiview tensors from sub arena to the parent.
1713	if (sub_arena_out_tensors)
1714	{
1715		for (i = 0; i < alloc_prep->block_size; i++)
1716			if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1717			{
1718				const int block_ref = alloc_prep->blocks[i].block_ref;
1719				if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1720					continue;
1721				int sub_arena_ref = block_ref;
1722				if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1723				{
1724					// Assigning out the tensor aliases.
1725					assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1725, __extension__ __PRETTY_FUNCTION__
); }));
1726					const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1727					// It referenced to is not an alias.
1728					assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
 0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1728, __extension__ __PRETTY_FUNCTION__
); }));
1729					sub_arena_ref = alias_ref;
1730					if (!sub_arena_out_tensors[sub_arena_ref])
1731						continue;
1732				}
1733				if (!sub_arena_out_tensors[sub_arena_ref])
1734					continue;
1735				ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
) ? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1736				assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1736, __extension__ __PRETTY_FUNCTION__); }));
1737				// This is only possible if the vt_tensors is a phi node.
1738				if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1739				{
1740					// For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1741					ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1742					assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
 ({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
 ("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1742, __extension__ __PRETTY_FUNCTION__); }));
1743					assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
 ? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1743, __extension__ __PRETTY_FUNCTION__
); }));
1744					CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)[1]->alias_ref = (uintptr_t)mv;
1745					ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)[1]);
1746				} else {
1747					tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1748					ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1749				}
1750			}
1751	}
1752	// Go over all the tensors that has assign_ref. If the tensor it is assigned from is:
1753	// 1). From sub_arena_out_tensors, it could be possible that it now pointing to an area this arena doesn't know.
1754	// 2). From phi multi-view, for this case, it is in fact that this arena won't know which memory I am going to use prior.
1755	// Therefore, for above two scenarios, the tensor has assign_ref, even it is a multiview tensor, need to subscribe
1756	// to the output of assign_ref tensor.
1757	for (i = 0; i < tensor_symbol_info_size; i++)
1758		if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1759		{
1760			const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1761			ccv_nnc_tensor_t* assign_tensor;
1762			if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1763				assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1764			else
1765				assign_tensor = tensor_arena->vt_tensors[assign_ref];
1766			ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1767		}
1768	// After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1769	for (i = 0; i < tensor_bind_size; i++)
1770	{
1771		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1771, __extension__ __PRETTY_FUNCTION__
); }));
1772		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1773		if (resolved_symbol.d >= 0)
1774		{
1775			int d = resolved_symbol.d;
1776			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1777			// It has nothing to do with alias.
1778			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1779				d = tensor_blocks[d].ref - 1;
1780			// Note we don't trace back on alias. This is intentional.
1781			assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
 tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1781, __extension__ __PRETTY_FUNCTION__
); }));
1782		}
1783	}
1784	if (sub_arena_out_tensors)
1785		ccfreefree(sub_arena_out_tensors);
1786	// Rewire sub arena's tensor references.
1787	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1788		if (tensor_arena->sub_arenas[i])
1789		{
1790			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1791			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1792			for (j = 0; j < node->input_size; j++)
1793			{
1794				const int idx = node->inputs[j];
1795				const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1 : -1;
1796				if (s_idx < 0)
1797					continue;
1798				ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1799				// Only do the replacement if it is a multi-view tensor.
1800				// sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1801				if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1802				{
1803					// This is binded tensor, bind it now.
1804					if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1805						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1806					else
1807						_ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1808				}
1809			}
1810		}
1811	return tensor_arena;
1812}
1813 
1814static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1815{
1816	assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
 ; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1816, __extension__ __PRETTY_FUNCTION__); }));
1817	if ((intptr_t)graph == tensor_arena->graph_ref)
1818	{
1819		assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
 0 && pair_ref < tensor_arena->vt_tensor_size) ;
 else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1819, __extension__ __PRETTY_FUNCTION__
); }));
1820		return tensor_arena->vt_tensors[pair_ref];
1821	}
1822	int i;
1823	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1824		if (tensor_arena->sub_arenas[i])
1825		{
1826			ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1827			if (tensor)
1828				return tensor;
1829		}
1830	return 0;
1831}
1832 
1833static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1834{
1835	if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1836		tensor->type |= CCV_TAPE_ALLOC;
1837	else {
1838		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1839		mv->type |= CCV_TAPE_ALLOC;
1840		int i;
1841		for (i = 0; i < mv->repeat + mv->kind; i++)
1842			_ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1843	}
1844}
1845 
1846static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1847{
1848	assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
 __assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1848, __extension__ __PRETTY_FUNCTION__
); }));
1849	int i;
1850	for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1851	{
1852		if (graph_prep->tensor_symbol_info[i].pair_ref)
1853		{
1854			tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1855			// No need to continue check this if it is from its pair.
1856			continue;
1857		}
1858		if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1859		{
1860			// If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1861			if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
))
1862			{
1863				const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1864				if (vt_ref >= 0 &&
1865					TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc) == READ_ONLY)
1866					continue;
1867			}
1868			_ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1869		}
1870	}
1871	for (i = 0; i < graph_prep->sub_prep_size; i++)
1872		if (graph_prep->sub_preps[i])
1873			_ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1874}
1875 
1876static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1877{
1878	int i, found = 0;
1879	// Try to insert head.
1880	ccv_array_t* head = tensor_blocks.head;
1881	assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
 else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1881, __extension__ __PRETTY_FUNCTION__); }));
1882	for (i = 0; i < head->rnum;)
1883	{
1884		const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i)));
1885		if (head_idx == idx)
1886		{
1887			found = 1;
1888			break;
1889		}
1890		ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1891		if (cell.i32 && cell.i32[0] > 0)
1892		{
1893			/* If the current node is the parent of the head node, check if we found it or not. */
1894			/* If not found, replace the current one. */
1895			if (!found)
1896			{
1897				found = 1;
1898				*(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i))) = idx;
1899			} else {
1900				/* Remove the current one, change the rnum. */
1901				if (i < head->rnum - 1)
1902					*(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i))) = *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(head->rnum - 1)));
1903				--head->rnum;
1904				continue;
1905			}
1906		} else {
1907			// If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1908			cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1909			if (cell.i32 && cell.i32[0] > 0)
1910			{
1911				found = 1;
1912				break;
1913			}
1914		}
1915		/* Advancing i. */
1916		++i;
1917	}
1918	/* If not found, push this idx to the end of the array. */
1919	if (!found)
1920		ccv_array_push(head, &idx);
1921	// Try to insert tail.
1922	found = 0;
1923	ccv_array_t* tail = tensor_blocks.tail;
1924	assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
 else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1924, __extension__ __PRETTY_FUNCTION__); }));
1925	for (i = 0; i < tail->rnum;)
1926	{
1927		const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i)));
1928		if (tail_idx == idx)
1929		{
1930			found = 1;
1931			break;
1932		}
1933		ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1934		if (cell.i32 && cell.i32[0] > 0)
1935		{
1936			/* If the current node is the child of the tail node, check if we found it or not. */
1937			/* If not found, replace the current one. */
1938			if (!found)
1939			{
1940				found = 1;
1941				*(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i))) = idx;
1942			} else {
1943				/* Remove the current one, change the rnum. */
1944				*(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i))) = *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(tail->rnum - 1)));
1945				--tail->rnum;
1946				continue;
1947			}
1948		} else {
1949			// If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1950			cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1951			if (cell.i32 && cell.i32[0] > 0)
1952			{
1953				found = 1;
1954				break;
1955			}
1956		}
1957		/* Advancing i. */
1958		++i;
1959	}
1960	/* If not found, push this idx to the end of the array. */
1961	if (!found)
1962		ccv_array_push(tail, &idx);
1963}
1964 
1965ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1966{
1967	if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1968	{
1969		assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
 0 && symbol.d < tensor_arena->vt_tensor_size) ;
 else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1969, __extension__ __PRETTY_FUNCTION__
); }));
1970		ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1971		if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1972		{
1973			ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1974			while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1975				mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1976			return (ccv_nnc_tensor_t*)mv;
1977		}
1978		return tensor;
1979	}
1980	int i;
1981	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1982		if (tensor_arena->sub_arenas[i])
1983		{
1984			ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1985			if (tensor)
1986				return tensor;
1987		}
1988	return 0;
1989}
1990 
1991ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1992{
1993	if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1994	{
1995		assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
 >= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1995, __extension__ __PRETTY_FUNCTION__
); }));
1996		return graph_exec_arena->graph_execs[symbol.d];
1997	}
1998	int i;
1999	for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
2000		if (graph_exec_arena->sub_arenas[i])
2001		{
2002			ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
2003			if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
2004				return exec;
2005		}
2006	return (ccv_nnc_graph_exec_t){}; // 0.
2007}
2008 
2009ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2010{
2011	return graph_exec_arena->source;
2012}
2013 
2014ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2015{
2016	return graph_exec_arena->destination;
2017}
2018 
2019// Check whether the head is the beginning of this block.
2020static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
2021{
2022	assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
 ({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 2022, __extension__ __PRETTY_FUNCTION__
); }));
2023	return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0))) == head_node);
2024}
2025 
2026// Check whether the tail is the end of this block.
2027static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2028{
2029	assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
 ({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2029, __extension__ __PRETTY_FUNCTION__
); }));
2030	return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0))) == tail_node);
2031}
2032 
2033// Make two tensor blocks one. Return 1 if that happened.
2034static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2035{
2036	// Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2037	if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2038		(!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2039		tensor_blocks[p_ref_0].tail->rnum == 1 &&
2040		tensor_blocks[p_ref_1].head->rnum == 1 &&
2041		tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2042		*(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
))
2043	{
2044		// If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2045		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
 ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
 == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
 ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2045, __extension__ __PRETTY_FUNCTION__); }));
2046		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
 ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
 == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
 ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2046, __extension__ __PRETTY_FUNCTION__); }));
2047		ccv_array_free(tensor_blocks[p_ref_0].tail);
2048		tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2049		if (tensor_blocks[p_ref_1].p_refs[0])
2050		{
2051			assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
 0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
 0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2051, __extension__ __PRETTY_FUNCTION__
); })); // It simply cannot have more than one p_refs, otherwise we cannot merge.
2052			if (!tensor_blocks[p_ref_0].p_refs[0])
2053				tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2054			else
2055				tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2056		}
2057		tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2058		TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
 & ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)));
2059		ccv_array_free(tensor_blocks[p_ref_1].head);
2060		if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2061			TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
 | UNFOLDABLE_AS_INPUT));
2062		// Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2063		TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
 & ~0x3) | UNASSIGNED));
2064		tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2065		if (!tensor_blocks[p_ref_0].r_refs)
2066			tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2067		ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2068		tensor_blocks[p_ref_1].size = 0;
2069		tensor_blocks[p_ref_1].head = 0;
2070		tensor_blocks[p_ref_1].tail = 0;
2071		return 1;
2072	}
2073	return 0;
2074}
2075 
2076static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2077{
2078	int i, j, k;
2079	// Generate exec dependencies (or, in other words, partial ordering of executions).
2080	ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2081	int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2082	int buf_size;
2083	if (p_node_info)
2084		{ assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
 if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2084, __extension__ __PRETTY_FUNCTION__
); })); }
2085#define for_block(x, val) \
2086	do { \
2087		if (((int32_t*)val)[0] > 0) \
2088		{ \
2089			buf[buf_size * 2] = x; \
2090			buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2091			++buf_size; \
2092		} \
2093	} while (0)
2094	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx; {
2095		buf_size = 0; /* save all its parent deps to this buffer */
2096		ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2097		if (vector)
2098			CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
2099		if (!node->outgoings)
2100			continue;
2101		for (i = 0; i < node->outgoings->rnum; i++)
2102		{
2103			int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
2104			const int32_t one = 1;
2105			ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2106			/* If not found, set, if the current node is the destination node, no need 
2107			 * set itself as parent of subsequent nodes because its terminal nature. */
2108			if (!cell.i32 || cell.i32[0] == 0)
2109				ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2110			if (buf_size > 0)
2111			{
2112				ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2113				assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2113, __extension__ __PRETTY_FUNCTION__); }));
2114				for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2115				{
2116					ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2117					/* If not found, set */
2118					if (!cell.i32 || cell.i32[0] == 0)
2119						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2120					else {
2121						/* Otherwise, set to the longest one */
2122						int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
 + 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; });
2123						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2124					}
2125				}
2126			}
2127		}
2128	} ccv_nnc_graph_visit_endfor} }
2129#undef for_block
2130	ccfreefree(buf);
2131	// This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2132	const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2133	ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2134	// The reason is that I need to make everyone of them to be unassigned unless it is used somewhere. It
2135	// happens that I have to loop through all relevant node to find out if one is used or not.
2136	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2137		tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2138	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2139		for (i = 0; i < node->input_size; i++)
2140			if (node->inputs[i] >= 0)
2141			{
2142				tensor_blocks[node->inputs[i]].flags = 0;
2143				// If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2144				// This will get propagated back to the buffer, and used there to determine the allocation function to use.
2145				if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2146					(node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2147					tensor_blocks[node->inputs[i]].pin_mem = 1;
2148			}
2149		for (i = 0; i < node->output_size; i++)
2150			if (node->outputs[i] >= 0)
2151			{
2152				tensor_blocks[node->outputs[i]].flags = 0;
2153				// If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2154				// This will get propagated back to the buffer, and used there to determine the allocation function to use.
2155				if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2156					(node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2157					tensor_blocks[node->outputs[i]].pin_mem = 1;
2158			}
2159	} ccv_nnc_graph_visit_endfor} }
2160	if (p_node_info)
2161	{
2162		assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
 ({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2162, __extension__ __PRETTY_FUNCTION__
); }));
2163		// Mark it as used if it is used in either input or output.
2164		for (i = 0; i < p_node_info->input_size; i++)
2165			if (p_node_info->inputs[i] >= 0)
2166			{
2167				const int d = p_node_info->inputs[i];
2168				if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2169				{
2170					const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1))) - 1;
2171					if (dd >= 0) // If this exists in this sub-graph, great.
2172						tensor_blocks[dd].flags = 0;
2173				}
2174			}
2175		for (i = 0; i < p_node_info->output_size; i++)
2176			if (p_node_info->outputs[i] >= 0)
2177			{
2178				const int d = p_node_info->outputs[i];
2179				if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2180				{
2181					const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1))) - 1;
2182					if (dd >= 0) // If this exists in this sub-graph, great.
2183						tensor_blocks[dd].flags = 0;
2184				}
2185			}
2186	}
2187	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2188		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2189		{
2190			// Check no tensor info is auto now.
2191			assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2191, __extension__ __PRETTY_FUNCTION__
); }));
2192			// If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
2193			// therefore, itself life-cycle almost certainly won't concatenate properly with the tensor to
2194			// fold to).
2195			if (tensor_symbol_info[i].assign_ref)
2196			{
2197				// TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2198				// It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2199				// it kept its own representation, which is not the case for output).
2200				TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
));
2201				const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2202				// But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2203				TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT));
2204				// It also cannot be folded as output (except i), because we need to keep its own representation.
2205				TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT));
2206				assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
 == 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2206, __extension__ __PRETTY_FUNCTION__
); }));
2207				tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2208				for (j = 0; j < unroll_count; j++)
2209				{
2210					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
 = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT));
2211					TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
 = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT));
2212				}
2213				if (tensor_blocks[assign_ref].bypass_ref)
2214				{
2215					// If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2216					tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2217					const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2218					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT));
2219					TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT));
2220					// On the other hand, it can be folded into the except_ref for the bypass_ref.
2221					tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2222					if (dup_tensor_from_ref)
2223					{
2224						const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2225						if (bypass_from_ref >= 0)
2226						{
2227							TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT));
2228							TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT));
2229							assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
 + unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
 - 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2229, __extension__ __PRETTY_FUNCTION__
); }));
2230							for (j = 0; j < unroll_count - 1; j++)
2231							{
2232								// Mark every incarnation as unfold-able.
2233								TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
 + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
 * unroll_count + j]].flags | UNFOLDABLE_AS_INPUT));
2234								TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
 + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
 * unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT));
2235							}
2236						}
2237					}
2238				}
2239			}
2240		}
2241	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2242	{
2243		// If it has a pair reference, we don't need to allocate this tensor at all,
2244		// set it to be unassigned.
2245		if (tensor_symbol_info[i].pair_ref)
2246			TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED));
2247		// If it is a tape variable, set it to be un-foldable as well (otherwise we cannot use tape properly).
2248		else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2249			TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2250			TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
));
2251			// For this case, there is no exception.
2252			tensor_blocks[i].unfoldable_except_ref = 0;
2253		} else if (tensor_symbol_info[i].p_ref) {
2254			assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2254, __extension__ __PRETTY_FUNCTION__); }));
2255			const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2256			// If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2257			if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2258				// TODO: This check can be lifted if we can fold in the parent graph.
2259				if (-1 == p_ref_is_in_or_out)
2260					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2261			if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2262				TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2263		}
2264	}
2265	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2266	{
2267		if (tensor_symbol_info[i].alias_ref)
2268		{
2269			const int ref = tensor_symbol_info[i].alias_ref - 1;
2270			// If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2271			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2272				tensor_blocks[ref].flags = 0;
2273			// An alias cannot ref to another alias.
2274			assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
 __assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2274, __extension__ __PRETTY_FUNCTION__); }));
2275			tensor_blocks[i].flags = ALIAS;
2276			tensor_blocks[i].ref = ref + 1; // Assign the ref.
2277			if (!tensor_blocks[ref].r_refs)
2278				tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2279			ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2280		}
2281	}
2282	// Scan again and if the ref is not assigned, mark the alias not assigned.
2283	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2284		if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2285		{
2286			const int ref = tensor_blocks[i].ref - 1;
2287			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2288			{
2289				// Mark this as unassigned.
2290				tensor_blocks[i].flags = UNASSIGNED;
2291				tensor_blocks[i].ref = 0;
2292			}
2293		}
2294	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2295	{
2296		// If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2297		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
2298		{
2299			tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2300			tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2301			// Cache tensor size (align to 16 bytes).
2302			tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2303		}
2304		// If there is a p_ref, add the one to the p_refs list.
2305		if (tensor_symbol_info[i].p_ref)
2306			tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2307	}
2308	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2309		for (i = 0; i < node->input_size; i++)
2310		{
2311			int d = node->inputs[i];
2312			if (d < 0)
2313				continue;
2314			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2315				d = tensor_symbol_info[d].alias_ref - 1;
2316			tensor_blocks[d].flags |= READ_ONLY;
2317			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2318				continue;
2319			assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2319, __extension__ __PRETTY_FUNCTION__
); }));
2320			/* If this is the first encounter, its head starts (this tensor is init'ed outside of the graph)
2321			 * from the very beginning of the graph life-cycle and ends here. */
2322			if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
)))
2323			{
2324				for (j = 0; j < source_size; j++)
2325				{
2326					// If the source is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2327					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2328					if (cell.i32 && cell.i32[0] > 0)
2329						_ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2330				}
2331				/* If this is a read-only (based on SSA, if first encountered as read), and this is
2332				 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2333				 * loop, however, in that case, you need to prevent read-only gets reused for the
2334				 * output tensor, which is not obvious how to implement correctly), and it is not
2335				 * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2336				 * of memory anyway (because on second loop, we want to read the same value out).
2337				 * Mark it to the end of the graph. */
2338				if (p_node_info && !tensor_symbol_info[d].assign_ref)
2339					for (j = 0; j < destination_size; j++)
2340					{
2341						// If the destination is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2342						const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2343						if (cell.i32 && cell.i32[0] > 0)
2344							_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2345					}
2346			}
2347			_ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2348		}
2349		for (i = 0; i < node->output_size; i++)
2350		{
2351			int d = node->outputs[i];
2352			if (d < 0)
2353				continue;
2354			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2355				d = tensor_symbol_info[d].alias_ref - 1;
2356			tensor_blocks[d].flags |= WRITE_ONLY;
2357			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2358				continue;
2359			assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2359, __extension__ __PRETTY_FUNCTION__
); }));
2360			_ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2361		}
2362	} ccv_nnc_graph_visit_endfor} }
2363	// For any assign_ref, its life-time kept until the end and wrap over.
2364	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2365		// If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2366		// that "somewhere else" need to keep its life-time til the end.
2367		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) &&
2368			p_node_info && tensor_symbol_info[i].assign_ref)
2369		{
2370			const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2371			for (j = 0; j < destination_size; j++)
2372			{
2373				// This logic is to be more conservative about which destination we add to.
2374				// As of now, if we add everything, it is fine most likely. However, it may
2375				// cause issues in the future to do so naively. Thus, instead, we only add
2376				// the destination to it iff either the tensor is not used at all, or, the
2377				// destination is on the same stream as of the tensor block some way.
2378				int flag = !tensor_blocks[assign_ref].tail;
2379				for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2380				{
2381					const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
 + (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)));
2382					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2383					flag = (cell.i32 && cell.i32[0] > 0);
2384				}
2385				if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2386					_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2387			}
2388		}
2389	for (i = 0; i < output_size; i++)
2390	{
2391		assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
 __assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2391, __extension__ __PRETTY_FUNCTION__); }));
2392		int d = outputs[i].d;
2393		if (d < 0)
2394			continue;
2395		if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2396			d = tensor_symbol_info[d].alias_ref - 1;
2397		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2398			continue;
2399		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2399, __extension__ __PRETTY_FUNCTION__
); }));
2400		for (j = 0; j < destination_size; j++)
2401		{
2402			int flag = !tensor_blocks[d].tail;
2403			for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2404			{
2405				const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)));
2406				const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2407				flag = (cell.i32 && cell.i32[0] > 0);
2408			}
2409			if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2410				_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2411		}
2412	}
2413	// Enforce tensor reuse by collapse tensors for in-place operations. We will fault if this cannot be done.
2414	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2415		int x, y;
2416		for (x = 0; x < node->input_size; x++)
2417			for (y = 0; y < node->output_size; y++)
2418				/* Some operations enforces some tensors to be the same for inputs / outputs. */
2419				if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2420				{
2421					// If both unassigned, it is fine.
2422					if (node->inputs[x] < 0 && node->outputs[y] < 0)
2423						continue;
2424					int ref = node->inputs[x];
2425					assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2425, __extension__ __PRETTY_FUNCTION__); }));
2426					while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
2427						ref = tensor_blocks[ref].ref - 1;
2428					const int node_output_y = node->outputs[y];
2429					assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
 ({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2429, __extension__ __PRETTY_FUNCTION__
); }));
2430					// If both are not computable, it is fine, we don't need to enforce.
2431					if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) &&
2432						!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
 !((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
)))
2433						continue;
2434					// Otherwise, enforce and error out if failed.
2435					if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2436						{ assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2436, __extension__ __PRETTY_FUNCTION__
); })); }
2437				}
2438	} ccv_nnc_graph_visit_endfor} }
2439	// Ignore tensors that are already bound, no matter if it is used or not. Doing it here because
2440	// we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2441	// that is not enforced in-place (because the tensor enforced in-place will be different than the
2442	// binding one).
2443	for (i = 0; i < tensor_bind_size; i++)
2444	{
2445		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2446		// If there is a tensor binded, then it is unassigned.
2447		if (resolved_symbol.d >= 0)
2448		{
2449			int d = resolved_symbol.d;
2450			// I cannot assert too much at this moment.
2451			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2452				d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2453			// This check is for in-place ops. Only in-place op could have unassigned but ref.
2454			// It has nothing to do with alias.
2455			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2456				d = tensor_blocks[d].ref - 1;
2457			// Doesn't work if this is a loop carrying variable.
2458			assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
 __extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
 __assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2458, __extension__ __PRETTY_FUNCTION__); }));
2459			tensor_blocks[d].flags = UNASSIGNED;
2460			tensor_blocks[d].ref = 0; // No need to have ref as well.
2461		}
2462	}
2463	// Maximum tensor reuse by collapse tensors allows in-place operations (and it matches the start, end tensor).
2464	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2465		int x, y;
2466		for (x = 0; x < node->input_size; x++)
2467		{
2468			/* If the input is not assigned, it can be referenced, find the referenced one */
2469			int ref = node->inputs[x];
2470			if (ref < 0)
2471				continue;
2472			const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2473			while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
2474				ref = tensor_blocks[ref].ref - 1;
2475			assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
 ({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2475, __extension__ __PRETTY_FUNCTION__
); }));
2476			if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) &&
2477				tensor_blocks[ref].tail->rnum == 1)
2478			{
2479				for (y = 0; y < node->output_size; y++)
2480					/* Only proceed if the input symbol is different from the output symbol, */
2481					/* and the input symbol meets the output symbol exactly at the same spot. */
2482					if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2483						node->outputs[y] >= 0 &&
2484						ref != node->outputs[y] &&
2485						TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
 0x3) == UNASSIGNED)))
2486					{
2487						const int node_output_y = node->outputs[y];
2488						const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2489						/* If dimension matches perfectly, then we can assign y_symbol to x.
2490						 * If both of them are aliases, making sure their origin matches in size too. */
2491						if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2492						{
2493							_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2494							// This refers to an alias itself, now mark it and will be processed later.
2495							if (ref != node->inputs[x])
2496								tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2497						}
2498					}
2499			}
2500		}
2501	} ccv_nnc_graph_visit_endfor} }
2502	// Specifically handle the bypass. This need to be done after the first pass.
2503	// I need to extend the bypass life-time to the same as the one I am going with.
2504	// It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2505	ccv_nnc_tensor_block_t empty_block = {};
2506	empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2507	empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2508	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2509		if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2510		{
2511			int can_bypass = 1;
2512			for (i = 0; can_bypass && i < node->output_size; i++)
2513			{
2514				int d = node->outputs[i];
2515				if (d < 0)
2516					continue;
2517				if (!tensor_blocks[d].bypass_ref)
2518					continue;
2519				while (tensor_blocks[d].ref)
2520					d = tensor_blocks[d].ref - 1;
2521				int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2522				while (tensor_blocks[bypass_ref].ref)
2523					bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2524				// If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2525				if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2526					continue;
2527				ccv_array_clear(empty_block.head);
2528				for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2529					ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
 + (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j))));
2530				ccv_array_clear(empty_block.tail);
2531				for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2532					ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
 + (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j))));
2533				for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2534					_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j))), empty_block);
2535				for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2536					_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j))), empty_block);
2537				// It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2538				assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
 tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
 ({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
 tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
 ("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2538, __extension__ __PRETTY_FUNCTION__
); }));
2539				int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2540				while (tensor_blocks[b_ref].ref)
2541					b_ref = tensor_blocks[b_ref].ref - 1;
2542				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2543				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2544				// These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2545				// even after we extend the life-time of bypass_ref. Then we are in a good shape.
2546				can_bypass = can_bypass && (a_hop_b || b_hop_a);
2547			}
2548			if (can_bypass)
2549			{
2550				for (i = 0; i < node->output_size; i++)
2551				{
2552					int d = node->outputs[i];
2553					if (d < 0)
2554						continue;
2555					if (!tensor_blocks[d].bypass_ref)
2556						continue;
2557					while (tensor_blocks[d].ref)
2558						d = tensor_blocks[d].ref - 1;
2559					int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2560					while (tensor_blocks[bypass_ref].ref)
2561						bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2562					// The bypass_ref can extend its life-time.
2563					for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2564						_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]);
2565					for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2566						_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]);
2567				}
2568			} else {
2569				for (i = 0; i < node->output_size; i++)
2570					tensor_blocks[node->outputs[i]].bypass_ref = 0;
2571				const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2572				// Mark this exec as no bypass IO (thus, I need to insert explicit data transfer.
2573				exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2574			}
2575		}
2576	} ccv_nnc_graph_visit_endfor} }
2577	ccv_array_free(empty_block.head);
2578	ccv_array_free(empty_block.tail);
2579	*r_exec_dep = exec_dep;
2580	*r_tensor_blocks = tensor_blocks;
2581}
2582 
2583static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2584{
2585	if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2586	{
2587		ccv_nnc_cmd_t retval = cmd;
2588		retval.cmd = CCV_NNC_NOOP;
2589		return retval;
2590	}
2591	return cmd;
2592}
2593 
2594static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2595{
2596	if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2597	{
2598		if (tensor_symbol_info[input].alias_ref)
2599		{
2600			const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2601			assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2601, __extension__ __PRETTY_FUNCTION__
); }));
2602			ccv_nnc_tensor_symbol_t tensor_symbol = {};
2603			if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2604			{
2605				tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2606				if (tensor_symbol_info[alias_ref].pair_ref)
2607					ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2608						.d = tensor_symbol_info[alias_ref].pair_ref - 1,
2609						.graph = dup_graph->pair
2610					});
2611				ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2612				dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2613			} else {
2614				tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2615				tensor_symbol.graph = dup_graph;
2616			}
2617			ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2618			if (tensor_symbol_info[input].pair_ref)
2619				ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2620					.d = tensor_symbol_info[input].pair_ref - 1,
2621					.graph = dup_graph->pair
2622				});
2623			ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2624			dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2625		} else {
2626			ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2627			if (tensor_symbol_info[input].pair_ref)
2628				ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2629					.d = tensor_symbol_info[input].pair_ref - 1,
2630					.graph = dup_graph->pair
2631				});
2632			ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2633			dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2634		}
2635		if (tensor_symbol_info[input].bypass_ref)
2636		{
2637			const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2638			assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
 ({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2638, __extension__ __PRETTY_FUNCTION__
); }));
2639			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])));
2640			symbol_info->bypass_ref = dup_bypass_ref + 1;
2641		}
2642	}
2643	return (ccv_nnc_tensor_symbol_t) {
2644		.d = dup_tensor_block_ref[input * unroll_count],
2645		.graph = dup_graph,
2646	};
2647}
2648 
2649static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2650{
2651	int i;
2652	if (dup_exec_ref[idx * unroll_count] < 0)
2653	{
2654		// Input has to come before output, because output could has a bypass reference to the input.
2655		for (i = 0; i < node->input_size; i++)
2656			max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2657		for (i = 0; i < node->output_size; i++)
2658			max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2659		ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2660		dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2661	}
2662	return (ccv_nnc_graph_exec_symbol_t) {
2663		.d = dup_exec_ref[idx * unroll_count],
2664		.graph = dup_graph,
2665	};
2666}
2667 
2668static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2669{
2670	int i;
2671	for (i = 0; i < tensor_block_size; i++)
2672	{
2673		if (tensor_blocks[i].head)
2674			ccv_array_free(tensor_blocks[i].head);
2675		if (tensor_blocks[i].tail)
2676			ccv_array_free(tensor_blocks[i].tail);
2677		if (tensor_blocks[i].r_refs)
2678			ccv_array_free(tensor_blocks[i].r_refs);
2679		if (tensor_blocks[i].dup_p_refs)
2680			ccv_array_free(tensor_blocks[i].dup_p_refs);
2681	}
2682	ccfreefree(tensor_blocks);
2683}
2684 
2685// Find tensors that cannot be solved by co-allocating to the same location.
2686static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2687{
2688	int i, j, unroll_count = 0;
2689	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2690		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2691		{
2692			// This is is a parameter, thus, it has to be either an alias or used.
2693			assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
 & 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
 ("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2693, __extension__ __PRETTY_FUNCTION__
); }));
2694			const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2695			// The parameter it assign to has to be either an alias or used.
2696			assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2696, __extension__ __PRETTY_FUNCTION__
); }));
2697			// If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2698			// If it is the same, we are good, no need to extend.
2699			int a_ref = i;
2700			while (tensor_blocks[a_ref].ref)
2701				a_ref = tensor_blocks[a_ref].ref - 1;
2702			int b_ref = assign_ref;
2703			while (tensor_blocks[b_ref].ref)
2704				b_ref = tensor_blocks[b_ref].ref - 1;
2705			if (a_ref != b_ref)
2706			{
2707				// If any of the b's head is deterministically later than a's tail
2708				// or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2709				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2710				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2711				// It cannot be that both i can hop to j can j can hop to i.
2712				assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
 ? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
 > 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2712, __extension__ __PRETTY_FUNCTION__
); }));
2713				// Can it be folded
2714				// These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2715				if (a_hop_b || b_hop_a)
2716				{
2717					tensor_blocks[a_ref].companion_ref = b_ref + 1;
2718					tensor_blocks[b_ref].companion_ref = a_ref + 1;
2719					continue;
2720				}
2721				int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2722				for (j = 0; c_ref >= 0; j++)
2723				{
2724					while (tensor_blocks[c_ref].ref)
2725						c_ref = tensor_blocks[c_ref].ref - 1;
2726					c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2727				}
2728				unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
 = (j + 1); (_a > _b) ? _a : _b; });
2729			}
2730		}
2731	// Reset companion_ref if need to unroll.
2732	if (unroll_count)
2733		for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2734			tensor_blocks[j].companion_ref = 0;
2735	return unroll_count;
2736}
2737 
2738static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2739{
2740	int i, j, n;
2741	// The inout exec nodes, these are the nodes we are going to extend.
2742	uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2743	int max_input_size = 0;
2744	int max_output_size = 0;
2745	for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2746	{
2747		max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; });
2748		max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; });
2749	}
2750	ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })];
2751	ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })];
2752	// Doing graph expansion
2753	// It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
2754	assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
 0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2754, __extension__ __PRETTY_FUNCTION__
); }));
2755	assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
 0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2755, __extension__ __PRETTY_FUNCTION__
); }));
2756#define INCOMING_NODE (1)
2757#define OUTGOING_NODE (2)
2758	// Unroll the graph n times.
2759	for (n = 0; n < unroll_count; n++)
2760	{
2761		int* const dup_exec_ref = r_dup_exec_ref + n;
2762		const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2763		int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2764		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2765			dup_exec_ref[i * unroll_count] = -1;
2766		for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2767		{
2768			// If there is a assign_ref, that means I don't need to dup the tensor.
2769			if (tensor_symbol_info[i].assign_ref)
2770			{
2771				const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2772				dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2773			} else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2774			// If this is a read-only tensor block, no need to duplicate because the value never changes
2775			// (note we handled assign_ref first), therefore, no need to generate duplicate.
2776				dup_tensor_block_ref[i * unroll_count] = i;
2777			else
2778				dup_tensor_block_ref[i * unroll_count] = -1;
2779		}
2780		// Go through the original graph, make copies of the node if it is inout.
2781		ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2782			ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2783			inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2784			if (!node->outgoings)
2785				continue;
2786			for (i = 0; i < node->outgoings->rnum; i++)
2787			{
2788				const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
2789				inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2790				ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2791				ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2792			}
2793		} ccv_nnc_graph_visit_endfor} }
2794		// Check the visitor are all marked as either incoming or outgoing.
2795		const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2796		const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2797		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2798		{
2799			if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2800				continue;
2801			assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
 OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
 INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
 ("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2801, __extension__ __PRETTY_FUNCTION__
); }));
2802			// If this is pure incoming nodes, then I need to concat this one with all original destination node
2803			if (inout[i] == INCOMING_NODE)
2804				for (j = 0; j < dup_destination_size; j++)
2805				{
2806					ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2807						.d = dup_destinations[j].d,
2808						.graph = dup_graph,
2809					}, (ccv_nnc_graph_exec_symbol_t) {
2810						.d = dup_exec_ref[i * unroll_count],
2811						.graph = dup_graph,
2812					});
2813				}
2814		}
2815		if (dup_graph->destinations)
2816			ccv_array_clear(dup_graph->destinations);
2817		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2818		{
2819			if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2820				continue;
2821			const int d = dup_exec_ref[i * unroll_count];
2822			ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
 + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)));
2823			// If this has no outgoing node, add to the destination.
2824			if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2825				ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2826					.graph = dup_graph,
2827					.d = d,
2828				});
2829		}
2830	}
2831#undef INCOMING_NODE
2832#undef OUTGOING_NODE
2833	ccfreefree(inout);
2834}
2835 
2836static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2837{
2838	int i;
2839	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2840		// Now can assign them (The dup) as companion.
2841		// Get to the last one, which we will wrap over.
2842		if (dup_tensor_symbol_info[i].assign_ref)
2843		{
2844			dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2845			dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2846			assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
 ; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2846, __extension__ __PRETTY_FUNCTION__
); }));
2847			dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2848		}
2849}
2850 
2851// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2852// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2853// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2854static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2855{
2856	int i, j, k;
2857	for (i = 0; i < p_node_info->output_size; i++)
2858	{
2859		const int d = p_node_info->outputs[i];
2860		const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx))) - 1;
2861		if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
 !((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED)))
2862			continue;
2863		for (k = 0; k < destination_size; k++)
2864			_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2865		// Add the duplicated destinations to the tensor_block_ref.
2866		for (j = 0; j < unroll_count; j++)
2867			for (k = 0; k < destination_size; k++)
2868			{
2869				const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2870				const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2871				if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2872					_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2873			}
2874	}
2875}
2876 
2877static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2878{
2879	int i, j;
2880	ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2881	ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2882	// blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2883	// Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2884	// No need to change anything, we are good.
2885	const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2886	if (!unroll_count)
2887		return;
2888	// Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2889	// Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2890	ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2891	int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2892	int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2893	_ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2894	ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2895	ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2896	ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
(dup_graph->exec_symbol_info->rnum) + _incoming_edges_ >
 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
 = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = {
 (dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1
; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for
 (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
 _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue
; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r =
 1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) {
 ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph
->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1
; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
 const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_
[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((
_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum
)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->destinations->rnum); _exist_size_[1] =
 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
 const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
 for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
 = 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
 int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
 6 && _d_ < (dup_graph->destinations->rnum))
 { _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
 < ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 6 && _d_ <
 (dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0
), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->
rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
 ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
 ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
 ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
 <= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
 ({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2896, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
2897	ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2898	_ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2899	// Free out the old exec_dep
2900	ccv_matrix_free(exec_dep);
2901	// and the tensor blocks, prepare for the new.
2902	_ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2903	// A reverse map to find where the original tensor comes from.
2904	int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2905	for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2906		dup_tensor_from_ref[i] = -1;
2907	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2908		for (j = 0; j < unroll_count; j++)
2909			if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2910				dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2911	int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2912	for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2913		dup_exec_from_ref[i] = -1;
2914	for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2915	{
2916		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2917			continue;
2918		dup_exec_from_ref[i] = i; // Reference back.
2919		for (j = 0; j < unroll_count; j++)
2920			if (dup_exec_ref[i * unroll_count + j] >= 0)
2921				dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2922	}
2923	// Reset all attr.
2924	memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2925	_ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2926	ccv_nnc_graph_visit_free(dup_visit);
2927	ccfreefree(dup_exec_symbol_info);
2928	ccfreefree(dup_exec_from_ref);
2929	ccfreefree(dup_tensor_from_ref);
2930	// Assign out dup_p_ref, which will be used to extended the anonymous block life-time.
2931	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2932		// Loop over all possible duplications to assign dup_p_ref properly.
2933		for (j = 0; j < unroll_count; j++)
2934		{
2935			const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2936			if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2937			{
2938				const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2939				const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2940				if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2941				{
2942					if (!tensor_blocks[dup_idx].dup_p_refs)
2943						tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2944					ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2945				}
2946				if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2947					continue;
2948				const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2949				const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2950				if (p_ref_1_is_in_or_out == 1)
2951				{
2952					if (!tensor_blocks[dup_idx].dup_p_refs)
2953						tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2954					ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2955				}
2956			}
2957		}
2958	// companion_ref
2959	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2960		// Now can assign them (The dup) as companion.
2961		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2962		{
2963			// Get to the last one, which we will wrap over.
2964			const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2965			if (assign_ref >= 0)
2966			{
2967				int b_ref = assign_ref;
2968				while (tensor_blocks[b_ref].ref)
2969					b_ref = tensor_blocks[b_ref].ref - 1;
2970				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2971				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2972				// It cannot be that both i can hop to j can j can hop to i.
2973				// And it can be hop from one to another now after duplication.
2974				assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
 ({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
 ("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2974, __extension__ __PRETTY_FUNCTION__); }));
2975				tensor_blocks[i].companion_ref = b_ref + 1;
2976				tensor_blocks[b_ref].companion_ref = i + 1;
2977			}
2978		}
2979	ccfreefree(dup_tensor_symbol_info);
2980	// Extend the dup tensor block ref, prepare for future extensions.
2981	dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2982	for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2983		dup_tensor_block_ref[i] = -1;
2984	// Assign out changed properties.
2985	*r_exec_dep = exec_dep;
2986	*r_tensor_blocks = tensor_blocks;
2987	*r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2988	*r_dup_graph = dup_graph;
2989	*r_unroll_count = unroll_count;
2990	*r_dup_exec_ref = dup_exec_ref;
2991	*r_dup_tensor_block_ref = dup_tensor_block_ref;
2992}
2993 
2994static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2995{
2996	if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2997		return tensor_block_size;
2998	int i;
2999	const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
3000	int found_idx = tensor_block_size;
3001	for (i = 0; i < anonymous_block_free_list_cap; i++)
3002	{
3003		const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)));
3004		assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
 ({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 3004, __extension__ __PRETTY_FUNCTION__
); }));
3005		// If the type doesn't match, ignore.
3006		if (tensor_blocks[idx].type != type)
3007			continue;
3008		// Heuristic about how to select the best tensor block to move forward.
3009		// If the size is larger, and no dup_p_refs, found, I cannot do better than this, just return directly.
3010		if (tensor_blocks[idx].size >= size)
3011		{
3012			if (no_dup_p_refs)
3013				return idx;
3014			// Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
3015			// then we cannot do better than this, if that is the case, just return.
3016			if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
3017				_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
3018				return idx;
3019		}
3020		int64_t found_idx_size_diff;
3021		int64_t idx_size_diff;
3022		if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
3023			// Now, compare whether this one or the found_idx one is better.
3024			// At this point, there is no point of comparing the dup_p_refs, we only care about which one
3025			// is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
3026			(found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3027		{
3028			found_idx = idx;
3029			continue;
3030		}
3031		// No need to update if found_idx is better than idx.
3032		if (found_idx_size_diff > idx_size_diff)
3033			continue;
3034		// We bias towards the bigger one in case of similar.
3035		if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3036		{
3037			found_idx = idx;
3038			continue;
3039		}
3040		assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
 == tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 3040, __extension__ __PRETTY_FUNCTION__
); }));
3041		// On a tie, check which one has tighter life-cycle.
3042		if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3043		{
3044			// Check whether the current tensor blocks life-cycle is longer than the previous one.
3045			if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3046				(!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3047				 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3048				found_idx = idx;
3049			continue;
3050		}
3051		// Now both our size is smaller than requested size, in this case, we need to increase the tensor block size.
3052		// We prefer to choose the one that has life-cycle closer to the expected ones.
3053		if (no_dup_p_refs)
3054		{
3055			// Whoever is shorter wins.
3056			if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3057				(!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3058				 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3059				found_idx = idx;
3060			continue;
3061		}
3062		if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3063			continue;
3064		if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3065		{
3066			found_idx = idx;
3067			continue;
3068		}
3069		// If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3070		const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3071		const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3072		if (idx_after_request && found_idx_after_request)
3073		{
3074			if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3075				found_idx = idx;
3076			continue;
3077		} else {
3078			// We entered this branch must be either idx_after_request is false or found_idx_after_request is false or both.
3079			// If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3080			// Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3081			if (!found_idx_after_request && (idx_after_request ||
3082				_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3083				found_idx = idx;
3084			continue;
3085		}
3086	}
3087	return found_idx;
3088}
3089 
// Duplicate every breakpoint of a while-loop sub-graph as a NOOP exec symbol that takes the
// parent while node's tensor inputs as its own inputs, and wire each NOOP into the graph at
// the same position as the breakpoint it mirrors (same incoming nodes, same outgoing nodes,
// and same source / destination membership).
//
// symbolic_graph: the sub-graph whose breakpoints are duplicated; it is mutated in place.
// p_node_info: the exec symbol info of the parent node that owns this sub-graph; may be 0.
//
// Returns a new ccv_array_t of ccv_nnc_graph_exec_symbol_t holding one NOOP per breakpoint,
// in breakpoint order (index k matches symbolic_graph->breakpoints[k]). Returns 0, leaving
// the graph untouched, when p_node_info is 0, is not flagged CCV_NNC_GRAPH_EXEC_P_WHILE, or
// has no non-negative entry in p_while.inputs.
3090static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3091{
	// Only a parent node that is a while loop has breakpoints worth duplicating.
3092	if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3093		return 0;
3094	int i, j, k;
	// First pass: count the inputs with a non-negative index so the array below can be sized.
3095	int input_size = 0;
3096	for (i = 0; i < p_node_info->p_while.input_size; i++)
3097		if (p_node_info->p_while.inputs[i] >= 0)
3098			++input_size;
3099	// If it doesn't have tensor inputs (thus, only special inputs), just return.
3100	if (!input_size)
3101		return 0;
	// Second pass: materialize the counted inputs as tensor symbols bound to symbolic_graph.
	// input_size is reset and re-counted, so it ends at the same value as above.
3102	ccv_nnc_tensor_symbol_t inputs[input_size];
3103	input_size = 0;
3104	for (i = 0; i < p_node_info->p_while.input_size; i++)
3105		if (p_node_info->p_while.inputs[i] >= 0)
3106			inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3107				.d = p_node_info->p_while.inputs[i],
3108				.graph = symbolic_graph,
3109			};
3110	assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
 1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
 > 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3110, __extension__ __PRETTY_FUNCTION__
); }));
3111	ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
	// Snapshot the exec symbol count before any NOOP is created below; the later scan over
	// [0, exec_symbol_info_size) is bounded by this value.
	// NOTE(review): presumably ccv_nnc_graph_exec_symbol_new appends to exec_symbol_info, so
	// this bound keeps the new NOOPs out of that scan - confirm.
3112	const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3113	for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3114	{
3115		// Make a noop copy of the breakpoint, but with some tensor inputs.
3116		ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
3117		ccv_array_push(dup_breakpoints, &noop);
3118		// Connect this noop to the outgoing nodes of breakpoints.
		// symbol_info is looked up only after the NOOP has been created.
3119		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(symbolic_graph->breakpoints[i].d)));
3120		if (symbol_info->outgoings)
3121			for (j = 0; j < symbol_info->outgoings->rnum; j++)
3122			{
3123				const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)));
3124				ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3125					.d = d,
3126					.graph = symbolic_graph,
3127				});
3128			}
3129	}
	// Connect every node that feeds a breakpoint to that breakpoint's NOOP as well, so the
	// NOOP gets the same incoming edges as the breakpoint.
3130	for (i = 0; i < exec_symbol_info_size; i++)
3131	{
3132		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(i)));
3133		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
3134			continue;
3135		if (symbol_info->outgoings)
3136		{
			// The outgoing count is captured once, before the loop, so only the edges present
			// at this point are examined.
			// NOTE(review): presumably the concat below appends to this node's outgoings,
			// which is why the count is cached - confirm.
3137			const int outgoing_size = symbol_info->outgoings->rnum;
3138			for (j = 0; j < outgoing_size; j++)
3139			{
3140				const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)));
3141				for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3142					if (d == symbolic_graph->breakpoints[k].d)
3143					{
						// dup_breakpoints[k] is the NOOP made for breakpoints[k].
3144						ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(k)));
3145						ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3146							.d = i,
3147							.graph = symbolic_graph,
3148						}, noop);
3149						// Found, connected, exit.
3150						break;
3151					}
3152			}
3153		}
3154	}
3155	// Add the dup_breakpoints to source if necessary.
3156	assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 3156, __extension__ __PRETTY_FUNCTION__
); }));
	// Iterate only over the sources that exist now; the count is read once before the loop
	// that calls ccv_nnc_symbolic_graph_add_source.
3157	const int source_size = symbolic_graph->sources->rnum;
3158	for (i = 0; i < source_size; i++)
3159	{
3160		const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(i))))->d;
3161		for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3162			if (d == symbolic_graph->breakpoints[j].d)
3163			{
3164				ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)));
3165				ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3166				// Found, made, exit.
3167				break;
3168			}
3169	}
3170	// Add the dup_breakpoints to destination if necessary.
3171	assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
 ({ if (symbolic_graph->destinations) ; else __assert_fail
 ("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 3171, __extension__ __PRETTY_FUNCTION__); }));
	// Same pattern as for sources: the destination count is read once before the loop that
	// calls ccv_nnc_symbolic_graph_add_destination.
3172	const int destination_size = symbolic_graph->destinations->rnum;
3173	for (i = 0; i < destination_size; i++)
3174	{
3175		const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(i))))->d;
3176		for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3177			if (d == symbolic_graph->breakpoints[j].d)
3178			{
3179				ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)));
3180				ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3181				// Found, made, exit.
3182				break;
3183			}
3184	}
3185	return dup_breakpoints;
3186}
3187 
3188// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3189static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3190{
3191	assert(source_size > 0)((void) sizeof ((source_size > 0) ? 1 : 0), __extension__ (
{ if (source_size > 0) ; else __assert_fail ("source_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3191, __extension__ __PRETTY_FUNCTION__
); }));
3192	assert(destination_size > 0)((void) sizeof ((destination_size > 0) ? 1 : 0), __extension__
 ({ if (destination_size > 0) ; else __assert_fail ("destination_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3192, __extension__ __PRETTY_FUNCTION__
); }));
3193	// First, fill all the "auto" holes.
3194	// This is the symbol table that with "auto" info filled up.
3195	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3196	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3197	ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3198	ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
 c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
 = ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
 > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
 _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
 = (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3198, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
 } _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
 3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
 __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3198, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
 _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3198, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
 1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
 _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
 _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
 ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
 ((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
 : 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3198, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
3199	ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3200	int i, j, k, p, q;
3201	const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3202	ccv_sparse_matrix_t* exec_dep;
3203	ccv_nnc_tensor_block_t* tensor_blocks;
3204	_ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3205	int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3206	// Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3207	// are automatically filled in, and all the sub-graphs are processed.
3208	// There is a last step though, for a while loop, it is parameterized:
3209	// while (x > 5) {
3210	//     y = x + 1;
3211	// } (y => x) // This means after this loop is done, y's value will be copied over to x.
3212	// We will do our best to avoid doing the actual data copy; what we do here is to check whether y can be x's alias.
3213	// If y can be x's alias, this is good, no other changes are required. In the above case, y can be x's alias because
3214	// it is an in-place operation.
3215	// But if y cannot be x's alias, for example, this while loop looks like this:
3216	// while (x > 5) {
3217	//     y = x + a
3218	//     b = x + y
3219	// } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3220	// For this example, y cannot be x's alias because x is used later to compute b (and that computation
3221	// has dependency on y as well).
3222	// For this case, we need to modify the computation graph. Previously, the graph looks like this:
3223	// y = x + a -> b = x + y
3224	// This graph will be extended to look like this:
3225	// y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3226	// while (x0 > 5) {
3227	//     y0 = x0 + a0
3228	//     b0 = x0 + y0
3229	//     if (y0 > 5) break
3230	//     y1 = y0 + b0
3231	//     b1 = y0 + y1
3232	// } (y1 => x0, b1 => a0)
3233	// After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3234	// with each other now).
3235	// With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
3236	// which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
3237	ccv_nnc_symbolic_graph_t* dup_graph = 0;
3238	int* dup_exec_ref = 0;
3239	int* dup_tensor_block_ref = 0;
3240	int unroll_count = 0;
3241	// In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3242	ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3243	prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3244	prep->flags = 0;
3245	// Cannot handle dup a node that is a graph as well.
3246	if (p_exec_symbol_info)
3247	{
3248		prep->flags = p_node_info->flags;
3249		if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3250		{
3251			_ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3252			_ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0))), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3253		} else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3254			// TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3255		}
3256	}
3257	ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3258	ccv_array_t* anonymous_block_free_list = 0;
3259	const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3260	// Record whether this tensor is folded in this round.
3261	uint32_t* const tensor_fold = (uint32_t*)ccmallocmalloc(sizeof(uint32_t) * tensor_fold_size);
3262	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
3263		for (p = 0; p < node->graph_ref_size; p++)
3264		{
3265			assert(symbolic_graph->sub_graphs)((void) sizeof ((symbolic_graph->sub_graphs) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sub_graphs) ; else __assert_fail (
"symbolic_graph->sub_graphs", "ccv_nnc_symbolic_graph_compile.c"
, 3265, __extension__ __PRETTY_FUNCTION__); }));
3266			ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (
node)->_inline_graph_ref)[p] - 1)));
3267			ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3268			ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0)((void*)(((char*)((sub_graph->sources)->data)) + (size_t
)(sub_graph->sources)->rsize * (size_t)(0))), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0)((void*)(((char*)((sub_graph->destinations)->data)) + (
size_t)(sub_graph->destinations)->rsize * (size_t)(0))), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3269			sub_prep->dup_breakpoints = dup_breakpoints;
3270			sub_prep->p = prep;
3271			sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[p] - 1] = sub_prep;
3272			const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3273			const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3274			for (i = 0; i < s_alloc_prep->block_size; i++)
3275			{
3276				const int block_ref = s_alloc_prep->blocks[i].block_ref;
3277				const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3278				if (block_ref < sub_prep->tensor_symbol_info_size)
3279				{
3280					// If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3281					// I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3282					if (s_tensor_blocks[block_ref].bypass_ref)
3283					{
3284						int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3285						while (s_tensor_blocks[bypass_ref].ref)
3286							bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3287						if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3288							s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3289							continue;
3290					}
3291					if (s_tensor_blocks[block_ref].p_refs[0])
3292					{
3293						/* If it is already properly assigned, next. */
3294						if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3295							s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3296						{
3297							if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3298								s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3299							else {
3300								assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3300, __extension__ __PRETTY_FUNCTION__
); }));
3301								s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3302							}
3303						}
3304						/* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3305						if (s_tensor_blocks[block_ref].p_refs[1] &&
3306							s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3307							s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3308						{
3309							assert(s_alloc_prep->buffers[buffer_ref].p_refs[0])((void) sizeof ((s_alloc_prep->buffers[buffer_ref].p_refs[
0]) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[buffer_ref
].p_refs[0]) ; else __assert_fail ("s_alloc_prep->buffers[buffer_ref].p_refs[0]"
, "ccv_nnc_symbolic_graph_compile.c", 3309, __extension__ __PRETTY_FUNCTION__
); }));
3310							assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3310, __extension__ __PRETTY_FUNCTION__
); }));
3311							s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3312						}
3313					}
3314				} else if (s_tensor_blocks[block_ref].dup_p_refs) {
3315					/* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3316					 * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3317					 * these anonymous blocks that has dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3318					 * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3319					 * its life-time to the end of the output tensor. */
3320					if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3321						s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3322					for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3323						ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j)((void*)(((char*)((s_tensor_blocks[block_ref].dup_p_refs)->
data)) + (size_t)(s_tensor_blocks[block_ref].dup_p_refs)->
rsize * (size_t)(j))));
3324				}
3325			}
3326		}
3327		const int init_tensor_block_size = tensor_block_size;
3328		int rw_anonymous_buffer_size_cap = 0;
3329		int ro_anonymous_buffer_size_cap = 0;
3330		if (anonymous_block_free_list)
3331			ccv_array_clear(anonymous_block_free_list);
3332		memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3333		for (p = 0; p < node->graph_ref_size; p++)
3334		{
3335			ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[p] - 1];
3336			const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3337			int rw_anonymous_buffer_size = 0;
3338			int ro_anonymous_buffer_size = 0;
3339			for (i = 0; i < s_alloc_prep->buffer_size; i++)
3340				if (s_alloc_prep->buffers[i].p_refs[0])
3341				{
3342					/* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3343					int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3344					/* Need to go through refs. Since we reuse the tensor block for this input, it now has to have allocate at least this much space. */
3345					int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3346					assert(p_ref_0_is_in_or_out != 0)((void) sizeof ((p_ref_0_is_in_or_out != 0) ? 1 : 0), __extension__
 ({ if (p_ref_0_is_in_or_out != 0) ; else __assert_fail ("p_ref_0_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3346, __extension__ __PRETTY_FUNCTION__
); }));
3347					int unref_p_ref_0 = p_ref_0;
3348					while (tensor_blocks[unref_p_ref_0].ref)
3349						unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3350					/* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3351					assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3351, __extension__ __PRETTY_FUNCTION__); }));
3352					if (s_alloc_prep->buffers[i].p_refs[1])
3353					{
3354						int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3355						const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3356						assert(p_ref_1_is_in_or_out != 0)((void) sizeof ((p_ref_1_is_in_or_out != 0) ? 1 : 0), __extension__
 ({ if (p_ref_1_is_in_or_out != 0) ; else __assert_fail ("p_ref_1_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3356, __extension__ __PRETTY_FUNCTION__
); }));
3357						int unref_p_ref_1 = p_ref_1;
3358						while (tensor_blocks[unref_p_ref_1].ref)
3359							unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3360						/* See above comment for the similar p_ref_0 check. */
3361						assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]))((void) sizeof ((!((tensor_blocks[unref_p_ref_1].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_1].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 3361, __extension__ __PRETTY_FUNCTION__); }));
3362						assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out)((void) sizeof ((p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ; else __assert_fail ("p_ref_0_is_in_or_out != p_ref_1_is_in_or_out"
, "ccv_nnc_symbolic_graph_compile.c", 3362, __extension__ __PRETTY_FUNCTION__
); }));
3363						int p_ref_t;
3364						if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3365						{
3366							CCV_SWAP(p_ref_0, p_ref_1, p_ref_t)((p_ref_t) = (p_ref_0), (p_ref_0) = (p_ref_1), (p_ref_1) = (p_ref_t
));
3367							CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t)((p_ref_t) = (unref_p_ref_0), (unref_p_ref_0) = (unref_p_ref_1
), (unref_p_ref_1) = (p_ref_t));
3368						}
3369						p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3370						/* If the dimensions match, we can fold. TODO: should the dimensions match perfectly here? */
3371						if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
3372						{
3373							const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3374							if (folded)
3375							{
3376								p_ref_0 = p_ref_1;
3377								unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3378								tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3379								for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3380								{
3381									const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3382									assert(folded && "the subsequent duplicates can be folded too.")((void) sizeof ((folded && "the subsequent duplicates can be folded too."
) ? 1 : 0), __extension__ ({ if (folded && "the subsequent duplicates can be folded too."
) ; else __assert_fail ("folded && \"the subsequent duplicates can be folded too.\""
, "ccv_nnc_symbolic_graph_compile.c", 3382, __extension__ __PRETTY_FUNCTION__
); }));
3383								}
3384							}
3385						}
3386					}
3387					/* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3388					 * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
3389					 * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3390					 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3391					 * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3392					 * associated with it, then we are good. */
3393					if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3394						(p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3395						(p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3396						TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3397					{
3398						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3399							{ assert(s_alloc_prep->buffers[i].p_refs[1] == 0)((void) sizeof ((s_alloc_prep->buffers[i].p_refs[1] == 0) ?
 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[i].p_refs
[1] == 0) ; else __assert_fail ("s_alloc_prep->buffers[i].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3399, __extension__ __PRETTY_FUNCTION__
); })); }
3400						/* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3401						 * is a long argument why that is the case, the digest is, it is much easier to control your output
3402						 * than your input). */
3403						s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3404						s_alloc_prep->buffers[i].p_refs[1] = 0;
3405						/* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3406						assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3406, __extension__ __PRETTY_FUNCTION__); }));
3407						tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size)({ typeof (s_alloc_prep->buffers[i].size) _a = (s_alloc_prep
->buffers[i].size); typeof (tensor_blocks[unref_p_ref_0].size
) _b = (tensor_blocks[unref_p_ref_0].size); (_a > _b) ? _a
 : _b; });
3408						for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3409							tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3410								tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3411									tensor_blocks[unref_p_ref_0].size;
3412					} else {
3413						s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3414						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3415							++ro_anonymous_buffer_size;
3416						else
3417							rw_anonymous_buffer_size += unroll_count + 1;
3418					}
3419				} else {
3420					if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3421						++ro_anonymous_buffer_size;
3422					else
3423						rw_anonymous_buffer_size += unroll_count + 1;
3424				}
3425			if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3426			{
3427				const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3428				// All read-write buffer (potentially) can be reused between each case..of branch.
3429				rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3430				// Read-only buffer cannot be reused between each case..of branch.
3431				ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3432				/* Anonymous block, allocate additional tensor blocks for this. */
3433				/* This is either because this is an internal tensor (don't have p_ref) */
3434				/* or it is an anonymous block itself within the sub graphs of this while graph. */
3435				tensor_blocks = (ccv_nnc_tensor_block_t*)ccreallocrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3436				memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3437				if (dup_tensor_block_ref)
3438					dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3439				for (i = 0; i < s_alloc_prep->buffer_size; i++)
3440					if (!s_alloc_prep->buffers[i].p_refs[0])
3441					{
3442						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3443						{
3444							assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)((void) sizeof ((tensor_block_size < init_tensor_block_size
 + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ? 1 : 0), __extension__ ({ if (tensor_block_size < init_tensor_block_size
 + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ; else __assert_fail ("tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap"
, "ccv_nnc_symbolic_graph_compile.c", 3444, __extension__ __PRETTY_FUNCTION__
); }));
3445							TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size])(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0x10) | ANONYMOUS));
3446							TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3447							tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3448							tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3449							tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3450							s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3451							tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3452							ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3453							ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3454							if (dup_p_refs && dup_p_refs->rnum > 0)
3455							{
3456								for (j = 0; j < dup_p_refs->rnum; j++)
3457								{
3458									const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3459									assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3459, __extension__ __PRETTY_FUNCTION__
); }));
3460									assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3460, __extension__ __PRETTY_FUNCTION__
); }));
3461									assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
 ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3461, __extension__ __PRETTY_FUNCTION__); }));
3462									// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3463									// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3464									if (tensor_symbol_info[dup_p_ref].p_ref)
3465									{
3466										const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3467										assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3467, __extension__ __PRETTY_FUNCTION__); }));
3468										const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3469										if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3470										{
3471											if (!tensor_blocks[tensor_block_size].dup_p_refs)
3472												tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3473											ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3474										}
3475									}
3476									if (!tensor_blocks[tensor_block_size].tail)
3477										tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3478									for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3479										_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
 (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k))), tensor_blocks[tensor_block_size]);
3480								}
3481							} else {
3482								tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3483								ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3484							}
3485							for (j = 0; j < source_size; j++)
3486								_ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3487							/* If this is a read-only (based on SSA, if first encountered as read), and this is
3488							 * sub-graph. Mark it to the end of the graph. */
3489							if (p_exec_symbol_info)
3490								for (j = 0; j < destination_size; j++)
3491									_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3492							/* If it is read-only, it is self-reflecting. */
3493							for (k = 0; k < unroll_count; k++)
3494							{
3495								for (j = 0; j < destination_size; j++)
3496									if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3497									_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3498								/* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3499								assert(symbolic_graph->p)((void) sizeof ((symbolic_graph->p) ? 1 : 0), __extension__
 ({ if (symbolic_graph->p) ; else __assert_fail ("symbolic_graph->p"
, "ccv_nnc_symbolic_graph_compile.c", 3499, __extension__ __PRETTY_FUNCTION__
); }));
3500								dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3501							}
3502							++tensor_block_size;
3503						} else {
3504							ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3505							const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3506							const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3507							// Find suitable tensor block from the free list.
3508							TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS));
3509							TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3510							s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3511							if (new_anonymous_tensor_block)
3512							{
3513								tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3514								tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3515								tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3516								tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3517								ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3518							} else {
3519								tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3520								tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
 _a : _b; });
3521							}
3522							if (dup_p_refs && dup_p_refs->rnum > 0)
3523							{
3524								for (j = 0; j < dup_p_refs->rnum; j++)
3525								{
3526									const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3527									assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3527, __extension__ __PRETTY_FUNCTION__
); }));
3528									assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3528, __extension__ __PRETTY_FUNCTION__
); }));
3529									// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3530									// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3531									if (tensor_symbol_info[dup_p_ref].p_ref)
3532									{
3533										const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3534										assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3534, __extension__ __PRETTY_FUNCTION__); }));
3535										const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3536										if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3537										{
3538											if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3539												tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3540											ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3541										}
3542									}
3543									assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
 ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3543, __extension__ __PRETTY_FUNCTION__); }));
3544									if (!tensor_blocks[tensor_block_idx].tail)
3545										tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3546									for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3547										_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
 (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k))), tensor_blocks[tensor_block_idx]);
3548									// We have to add it to the wrap-around companion_ref as well.
3549									// TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3550									// be occupied and unlikely to be reused), we cannot really do too much about it because the companion_ref's
3551									// definition is too free-form and if we enforce a stronger guarantee on this (such as it must wrap around), this
3552									// guarantee may be broken down the line.
3553									if (tensor_blocks[dup_p_ref].companion_ref)
3554									{
3555										const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3556										for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3557											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3558										for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3559											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3560									}
3561								}
3562							} else if (new_anonymous_tensor_block) {
3563								tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3564								ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3565							}
3566							const int prev_tensor_block_idx = tensor_block_idx;
3567							if (new_anonymous_tensor_block)
3568							{
3569								if (!anonymous_block_free_list)
3570									anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3571								ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3572								++tensor_block_size;
3573							}
3574							for (k = 0; k < unroll_count; k++)
3575							{
3576								const int tensor_block_idx = new_anonymous_tensor_block ?
3577									(dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3578									dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3579								TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS));
3580								TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3581								if (new_anonymous_tensor_block)
3582								{
3583									tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3584									tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3585									tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3586									tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3587									/* Attach to duplicated exec for this tensor block. */
3588									ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3589								} else {
3590									tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3591									tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
 _a : _b; });
3592									_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3593 
3594								}
3595								if (dup_p_refs && dup_p_refs->rnum > 0)
3596								{
3597									/* Not nil, not self-reflecting. */
3598									for (j = 0; j < dup_p_refs->rnum; j++)
3599									{
3600										const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3601										assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3601, __extension__ __PRETTY_FUNCTION__
); }));
3602										assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3602, __extension__ __PRETTY_FUNCTION__
); }));
3603										// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3604										// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3605										if (tensor_symbol_info[dup_p_ref].p_ref)
3606										{
3607											const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3608											assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3608, __extension__ __PRETTY_FUNCTION__); }));
3609											const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3610											if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3611											{
3612												if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3613													tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3614												ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3615											}
3616										}
3617										assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref)((void) sizeof ((dup_tensor_block_ref[dup_p_ref * unroll_count
 + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count
 + k] != dup_p_ref) ? 1 : 0), __extension__ ({ if (dup_tensor_block_ref
[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref
[dup_p_ref * unroll_count + k] != dup_p_ref) ; else __assert_fail
 ("dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref"
, "ccv_nnc_symbolic_graph_compile.c", 3617, __extension__ __PRETTY_FUNCTION__
); }));
3618										const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3619										assert(tensor_blocks[dup_dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_dup_p_ref].tail) ? 1 : 0),
 __extension__ ({ if (tensor_blocks[dup_dup_p_ref].tail) ; else
 __assert_fail ("tensor_blocks[dup_dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3619, __extension__ __PRETTY_FUNCTION__); }));
3620										if (!tensor_blocks[tensor_block_idx].tail)
3621											tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3622										for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3623											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q)((void*)(((char*)((tensor_blocks[dup_dup_p_ref].tail)->data
)) + (size_t)(tensor_blocks[dup_dup_p_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3624										// We have to add it to the wrap-around companion_ref as well.
3625										if (tensor_blocks[dup_dup_p_ref].companion_ref)
3626										{
3627											const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3628											for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3629												_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3630											for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3631												_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3632										}
3633									}
3634								} else if (new_anonymous_tensor_block) {
3635									tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3636									ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3637								}
3638								if (new_anonymous_tensor_block)
3639									++tensor_block_size;
3640							}
3641						}
3642					}
3643			}
3644		}
3645	} ccv_nnc_graph_visit_endfor} }
3646	if (anonymous_block_free_list)
3647		ccv_array_free(anonymous_block_free_list);
3648	ccfreefree(tensor_fold);
3649	// It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3650	// the allocation dependencies, thus, which tensor is reused to the existing tensor.
3651	ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3652	prep->while_count_tensor = 0;
3653	prep->dup_breakpoints = 0;
3654	prep->p = 0;
3655	prep->symbolic_graph = symbolic_graph;
3656	prep->p_idx = symbolic_graph->p_idx;
3657	prep->exec_idx = symbolic_graph->exec_idx;
3658	prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3659	prep->sub_preps = sub_preps;
3660	prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3661	prep->exec_symbol_info = exec_symbol_info;
3662	prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3663	prep->tensor_symbol_info = tensor_symbol_info;
3664	prep->unroll_count = unroll_count;
3665	prep->dup_tensor_block_ref = dup_tensor_block_ref;
3666	prep->tensor_block_size = tensor_block_size;
3667	prep->tensor_blocks = tensor_blocks;
3668	prep->exec_flags = exec_flags;
3669	prep->visit = visit;
3670	prep->alloc_prep = alloc_prep;
3671	if (dup_graph)
3672		ccv_nnc_symbolic_graph_free(dup_graph);
3673	if (dup_exec_ref)
3674		ccfreefree(dup_exec_ref);
3675	return prep;
3676}
3677 
3678static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3679{
3680	int i;
3681	_ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3682	ccfreefree(prep->exec_flags);
3683	for (i = 0; i < prep->sub_prep_size; i++)
3684		if (prep->sub_preps[i])
3685			_ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3686	if (prep->sub_preps)
3687		ccfreefree(prep->sub_preps);
3688	ccfreefree(prep->tensor_symbol_info);
3689	ccfreefree(prep->exec_symbol_info);
3690	if (prep->dup_tensor_block_ref)
3691		ccfreefree(prep->dup_tensor_block_ref);
3692	_ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3693	ccv_nnc_graph_visit_free(prep->visit);
3694	ccfreefree(prep);
3695}
3696 
3697static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3698{
3699	int i, j;
3700	ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx; {
3701		if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3702		{
3703			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[0] - 1;
3704			assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3704, __extension__ __PRETTY_FUNCTION__
); }));
3705			ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3706			for (i = 0; i < node->p_while.input_size; i++)
3707				if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3708				{
3709					ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3710					const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3711					for (j = 0; j < d; j++)
3712						prep = prep->p;
3713					prep->while_count_tensor = 1;
3714				}
3715		}
3716		for (i = 0; i < node->graph_ref_size; i++)
3717		{
3718			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
3719			if (graph_ref >= 0)
3720				_ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3721		}
3722	} ccv_nnc_graph_visit_endfor} }
3723}
3724 
3725static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3726{
3727	if (symbol >= 0)
3728		return graph_prep->tensor_arena->vt_tensors[symbol];
3729	if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3730		return 0;
3731	assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
 : 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
 0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3731, __extension__ __PRETTY_FUNCTION__
); }));
3732	const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3733	int i;
3734	const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3735	for (i = 0; i < d; i++)
3736		prep = prep->p;
3737	assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
 ({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3737, __extension__ __PRETTY_FUNCTION__
); }));
3738	return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3739}
3740 
3741static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3742{
3743	int i;
3744	int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3745	ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3746	graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3747	graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3748	ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3749	for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3750		if (graph_execs[i].graph == graph)
3751			graph_execs[i].d = exec_cvt[graph_execs[i].d];
3752	ccfreefree(exec_cvt);
3753}
3754 
3755static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3756{
3757	int i, j, k;
3758	ccv_nnc_graph_t* const graph = graph_prep->graph;
3759	const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3760	ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3761	graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3762	graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3763	graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3764	graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3765	memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3766	ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3767	int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3768	for (i = 0; i < exec_symbol_info_size; i++)
3769	{
3770		max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; });
3771		max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; });
3772		if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3773			max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
 : _b; });
3774		graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3775		graph_execs[i].graph = 0;
3776	}
3777	for (i = 0; i < graph_prep->sub_prep_size; i++)
3778		max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
 ((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; });
3779	ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })];
3780	ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })];
3781	ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })];
3782	const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3783	const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3784	// Create node, this is in topological order.
3785	ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx; {
3786		if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
3787		{
3788			for (i = 0; i < node->input_size; i++)
3789				max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3790			for (i = 0; i < node->output_size; i++)
3791				max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3792			if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3793			{
3794				const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[0] - 1;
3795				assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3795, __extension__ __PRETTY_FUNCTION__
); }));
3796				ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3797				ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3798				graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3799				const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)));
3800				ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3801				ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3802				for (i = 0; i < node->p_while.input_size; i++)
3803					max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3804				for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3805					max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3806				ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3807				_ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3808			} else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3809				for (i = 0; i < node->output_size; i++)
3810					if (max_outputs[i] && max_outputs[i]->alias_ref)
3811						max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3812				graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3813				// Check whether this is already covered in the inputs, if not, need to be covered in the update.
3814				for (i = 0; i < node->case_of.argument.offset; i++)
3815				{
3816					ccv_nnc_tensor_t* const update = max_inputs[i];
3817					if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3818						continue;
3819					int flag = 0;
3820					for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3821						flag = (update == max_inputs[j]);
3822					if (!flag)
3823						ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3824				}
3825				const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3826				ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3827				if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3828				{
3829					// Add another graph for data transfer.
3830					ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3831					for (i = 0; i < node->output_size; i++)
3832						max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3833					ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
 (node->output_size) _b = (node->output_size); (_a <
 _b) ? _a : _b; }), max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
 (node->output_size) _b = (node->output_size); (_a <
 _b) ? _a : _b; }));
3834					ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3835					ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3836					ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3837					int exec_cvt;
3838					ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3839				}
3840				for (i = 0; i < node->graph_ref_size; i++)
3841				{
3842					const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
3843					if (graph_ref < 0)
3844						continue;
3845					ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3846					const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)));
3847					ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3848					ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3849					_ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3850				}
3851			} else {
3852				graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3853			}
3854			ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3855		}
3856	} ccv_nnc_graph_visit_endfor} }
3857	// Then connect them.
3858	ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx; {
3859		if (node->outgoings)
3860			for (i = 0; i < node->outgoings->rnum; i++)
3861			{
3862				const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
3863				if (graph_execs[outgoing].graph)
3864					ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3865			}
3866	} ccv_nnc_graph_visit_endfor} }
3867	int source_exec_created = 0;
3868	const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3869	const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3870	ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3871	// After the graph is materialized, we need to handle the case that some of these tensors require to be initialized to zero before use.
3872	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3873	{
3874		if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
)))
3875		{
3876			int ref = i;
3877			while (tensor_symbol_info[ref].alias_ref)
3878				ref = tensor_symbol_info[ref].alias_ref - 1;
3879			while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
3880				ref = tensor_blocks[ref].ref - 1;
3881			// This is not computable. It could be that we marked a const tensor as init zero.
3882			if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)))
3883				continue;
3884			// If this tensor is not used by any exec, we don't need to init at all. Skip.
3885			if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3886				continue;
3887			ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3888			// Now, we have the original tensor, we can get the actual tensor, and construct the set command.
3889			ccv_nnc_graph_exec_t set_exec;
3890			if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3891				set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3892			else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3893				set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3894			for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3895			{
3896				const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)));
3897				if (outgoing >= exec_symbol_info_size)
3898					continue;
3899				assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
 if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3899, __extension__ __PRETTY_FUNCTION__
); }));
3900				assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
 ({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3900, __extension__ __PRETTY_FUNCTION__
); }));
3901				ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3902			}
3903			int flags = 0;
3904			if (alloc_dep[ref])
3905				for (j = 0; j < alloc_dep[ref]->rnum; j++)
3906				{
3907					const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)));
3908					// This is from alloc_dep, it should be computable.
3909					assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3909, __extension__ __PRETTY_FUNCTION__
); }));
3910					if (tensor_blocks[d].tail)
3911						for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3912						{
3913							const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)));
3914							if (incoming >= exec_symbol_info_size)
3915								continue;
3916							assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
 if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3916, __extension__ __PRETTY_FUNCTION__
); }));
3917							assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
 ({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3917, __extension__ __PRETTY_FUNCTION__
); }));
3918							ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3919							flags = 1;
3920						}
3921				}
3922			// If cannot find a start node for this exec, we need to append it to the no-op of the start.
3923			if (!flags)
3924			{
3925				if (!source_exec_created)
3926				{
3927					graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3928					source_exec_created = 1;
3929				}
3930				ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3931			}
3932		}
3933	}
3934	// Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3935	// (we need that if it is not associated as inputs / outputs of any execs, this is possible if all execs associate
3936	// with its alias).
3937	assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3937, __extension__ __PRETTY_FUNCTION__
); }));
3938	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3939	{
3940		ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3941		// If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3942		if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3943		{
3944			const ccv_array_t* const head = tensor_blocks[i].head;
3945			if (head && head->rnum > 0)
3946				for (j = 0; j < head->rnum; j++)
3947				{
3948					const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(j)));
3949					if (idx >= exec_symbol_info_size)
3950						continue;
3951					assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3951, __extension__ __PRETTY_FUNCTION__); }));
3952					const int d = graph_execs[idx].d;
3953					ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)));
3954					int flag = 0;
3955					if (exec_info->tensor_wraps_ref)
3956					{
3957						ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)));
3958						for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3959							flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3960					}
3961					// If none is in the flag, it need to be included in the cast.
3962					if (!flag)
3963						ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3964				}
3965		}
3966	}
3967	// Create source / destination phony node. This is to facilitate use of compiled graph.
3968	// Also, this is needed if you have init zero execs.
3969	if (source_exec_created || source_size > 1)
3970	{
3971		if (!source_exec_created)
3972			graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3973		for (i = 0; i < source_size; i++)
3974			ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3975	} else {
3976		assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
 ({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3976, __extension__ __PRETTY_FUNCTION__
); }));
3977		assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
 if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3977, __extension__ __PRETTY_FUNCTION__
); }));
3978		graph_exec_arena->source = graph_execs[sources[0].d];
3979	}
3980	if (destination_size == 1)
3981		graph_exec_arena->destination = graph_execs[destinations[0].d];
3982	else {
3983		graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3984		for (i = 0; i < destination_size; i++)
3985			ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3986	}
3987	ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3988	ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3989	return graph_exec_arena;
3990}
3991 
3992static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3993{
3994	if (graph_prep->symbolic_graph == pair)
3995		return graph_prep->graph;
3996	int i;
3997	for (i = 0; i < graph_prep->sub_prep_size; i++)
3998		if (graph_prep->sub_preps[i])
3999		{
4000			ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
4001			if (graph)
4002				return graph;
4003		}
4004	return 0;
4005}
4006 
4007static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4008{
4009	int i;
4010	for (i = 0; i < graph_prep->sub_prep_size; i++)
4011		if (graph_prep->sub_preps[i])
4012		{
4013			if (graph_prep->sub_preps[i]->symbolic_graph->pair)
4014				graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
4015		}
4016}
4017 
4018static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4019{
4020	assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4020, __extension__ __PRETTY_FUNCTION__
); }));
4021	int i;
4022	for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
4023	{
4024		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
))
4025			continue;
4026		if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4027		{
4028			ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4029				.d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4030				.graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4031			});
4032			if (pair_exec.d >= 0)
4033				ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4034		}
4035	}
4036	for (i = 0; i < graph_prep->sub_prep_size; i++)
4037		if (graph_prep->sub_preps[i])
4038			_ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4039}
4040 
4041static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4042{
4043	int i;
4044	if (graph_prep->dup_breakpoints)
4045	{
4046		// Strip the const modifier only possible because it is a sub-graph.
4047		ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4048		for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4049			ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
 + (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i))));
4050		ccv_array_free(graph_prep->dup_breakpoints);
4051		graph_prep->dup_breakpoints = 0;
4052		graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4053		// Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
4054		memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4055		// Since exec_symbol_info changed, create a new visit object.
4056		assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4056, __extension__ __PRETTY_FUNCTION__
); }));
4057		assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
 ({ if (symbolic_graph->destinations) ; else __assert_fail
 ("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4057, __extension__ __PRETTY_FUNCTION__); }));
4058		ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)));
4059		const int source_size = symbolic_graph->sources->rnum;
4060		ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)));
4061		const int destination_size = symbolic_graph->destinations->rnum;
4062		ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
 c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
 = ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
 > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
 _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
 = (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4062, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
 } _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
 3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
 __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4062, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
 _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4062, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
 1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
 _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
 _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
 ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
 ((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
 : 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
4063		ccv_nnc_graph_visit_free(graph_prep->visit);
4064		graph_prep->visit = visit;
4065		assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4065, __extension__ __PRETTY_FUNCTION__
); }));
4066		ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4067	}
4068	ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx; {
4069		for (i = 0; i < node->graph_ref_size; i++)
4070		{
4071			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
4072			if (graph_ref >= 0)
4073				_ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4074		}
4075	} ccv_nnc_graph_visit_endfor} }
4076}
4077 
// Default compile parameters: the empty initializer leaves every field of the struct zero-initialized.
4078const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4079 
4080void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4081{
4082	assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4082, __extension__ __PRETTY_FUNCTION__); }));
4083	assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
 if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4083, __extension__ __PRETTY_FUNCTION__
); }));
4084	assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
 ({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4084, __extension__ __PRETTY_FUNCTION__
); }));
4085	int i;
4086	// Cannot bind the multi-view.
4087	for (i = 0; i < tensor_bind_size; i++)
4088	{
4089		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4089, __extension__ __PRETTY_FUNCTION__
); }));
4090		assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4090, __extension__ __PRETTY_FUNCTION__
); }));
4091	}
4092	ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4093	_ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4094	ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4095	_ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4096	*tensor_arena_ref = tensor_arena;
4097	// The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
4098	_ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4099	// Now tensor allocation is done, if there are any dup_breakpoints, I need to clean it up.
4100	_ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4101	*graph_ref = graph_prep->graph;
4102	ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4103	_ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4104	_ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4105	*graph_exec_arena_ref = graph_exec_arena;
4106	_ccv_nnc_symbolic_graph_prep_free(graph_prep);
4107}
4108 
4109static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4110{
4111	// Buffers are inherited from above, no need to dealloc.
4112	int i;
4113	for (i = 0; i < tensor_arena->sub_arena_size; i++)
4114		if (tensor_arena->sub_arenas[i])
4115			_ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4116	for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4117	{
4118		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
 (size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i))));
4119		assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4119, __extension__ __PRETTY_FUNCTION__
); }));
4120		ccv_nnc_tensor_multiview_free(*mv);
4121	}
4122	ccv_array_free(tensor_arena->tensor_metadata);
4123	ccv_array_free(tensor_arena->m_tensor_idx);
4124	if (tensor_arena->pb_vt_tensors)
4125		ccfreefree(tensor_arena->pb_vt_tensors);
4126	if (tensor_arena->vt_alias_r_refs_p)
4127		ccfreefree(tensor_arena->vt_alias_r_refs_p);
4128	if (tensor_arena->vt_sizes)
4129		ccfreefree(tensor_arena->vt_sizes);
4130	ccfreefree(tensor_arena);
4131}
4132 
4133void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4134{
4135	assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
 == (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4135, __extension__ __PRETTY_FUNCTION__
); }));
1
Assuming field 'graph_ref' is equal to field 'graph'→
2
←
Taking true branch→
4136	assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4136, __extension__ __PRETTY_FUNCTION__
); }));
3
←
Assuming field 'd' is < field 'vt_tensor_size'→
4
←
Taking true branch→
4137	assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
 if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4137, __extension__ __PRETTY_FUNCTION__
); }));
5
←
Assuming field 'd' is >= 0→
6
←
Taking true branch→
4138	// Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4139	int i;
4140	if (!tensor_arena->pb_vt_tensors)
7
←
Assuming field 'pb_vt_tensors' is null→
8
←
Taking true branch→
4141	{
4142		tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4143		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
9
←
Assuming 'i' is < field 'vt_tensor_size'→
10
←
Loop condition is true.  Entering loop body→
13
←
Assuming 'i' is >= field 'vt_tensor_size'→
14
←
Loop condition is false. Execution continues on line 4147→
4144			if (tensor_arena->vt_tensors[i])
11
←
Assuming pointer value is null→
12
←
Taking false branch→
4145				tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4146	}
4147	if (!tensor_arena->vt_alias_r_refs_p)
15
←
Assuming field 'vt_alias_r_refs_p' is non-null→
4148	{
4149		tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4150		tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4151		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4152			if (tensor_arena->vt_alias_refs[i])
4153			{
4154				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4155				assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
 >= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4155, __extension__ __PRETTY_FUNCTION__
); }));
4156				++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4157			}
4158		int refp = 0;
4159		for (i = 0; i < tensor_arena->vt_tensor_size; i++) // Allocate each with aliases position on vt_alias_r_refs. It points to the end.
4160			if (tensor_arena->vt_alias_r_refs_p[i])
4161				refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4162			else
4163				tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4164		for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4165			tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4166		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4167			if (tensor_arena->vt_alias_refs[i])
4168			{
4169				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4170				assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
 >= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4170, __extension__ __PRETTY_FUNCTION__
); }));
4171				const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4172				assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4172, __extension__ __PRETTY_FUNCTION__); }));
4173				tensor_arena->vt_alias_r_refs[pos] = i;
4174			}
4175	}
4176	const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
16
←
Taking false branch→
17
←
Assuming the condition is false→
18
←
'?' condition is false→
19
←
'symbol_d' initialized to 0→
4177	if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
20
←
Assuming the condition is false→
21
←
Taking false branch→
4178	{
4179		assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4179, __extension__ __PRETTY_FUNCTION__
); })); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4180		assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4182, __extension__ __PRETTY_FUNCTION__
); }))
4181					ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4182, __extension__ __PRETTY_FUNCTION__
); }))
4182				(size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4182, __extension__ __PRETTY_FUNCTION__
); }));
4183	} else
4184		{ assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof ((ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if (ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4184, __extension__ __PRETTY_FUNCTION__
); })); }
22
←
Access to field 'info' results in a dereference of a null pointer
4185	if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d])((*(int*)(tensor_arena->vt_tensors[symbol.d])) & CCV_TENSOR_VIEW
))
4186		{ assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors
[symbol.d])->off == 0) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t
*)tensor_arena->vt_tensors[symbol.d])->off == 0) ; else
 __assert_fail ("((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4186, __extension__ __PRETTY_FUNCTION__
); })); }
4187	tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4188	if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4189		for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4190		{
4191			const int d = tensor_arena->vt_alias_r_refs[i];
4192			if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4193				break;
4194			ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4195			d_tensor->info.datatype = tensor->info.datatype;
4196			d_tensor->info.reserved = tensor->info.reserved;
4197			if (CCV_IS_TENSOR_VIEW(d_tensor)((*(int*)(d_tensor)) & CCV_TENSOR_VIEW))
4198				ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4199			else {
4200				d_tensor->data.u8 = tensor->data.u8;
4201				d_tensor->dataof = tensor->dataof;
4202			}
4203		}
4204}
4205 
4206void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4207{
4208	if (!tensor_arena->pb_vt_tensors)
4209		return;
4210	int i;
4211	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4212		if (tensor_arena->vt_tensors[i])
4213			tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4214}
4215 
4216uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4217{
4218	uint64_t total_size = 0;
4219	int i;
4220	for (i = 0; i < tensor_arena->buffer_size; i++)
4221		total_size += tensor_arena->buffers[i].size;
4222	return total_size;
4223}
4224 
4225static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4226{
4227	int i;
4228	if (mv->it)
4229		mv->it->info = params;
4230	for (i = 0; i < mv->repeat + mv->kind; i++)
4231	{
4232		ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i];
4233		if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4234			_ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4235		else
4236			tensor->info = params;
4237	}
4238}
4239 
4240int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4241{
4242	int i;
4243	assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size)((void) sizeof ((graph->tensor_symbol_info->rnum >= tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (graph->
tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size
) ; else __assert_fail ("graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4243, __extension__ __PRETTY_FUNCTION__
); }));
4244	if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
4245	{
4246		tensor_arena->vt_sizes = (size_t*)ccmallocmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4247		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4248			if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4249			{
4250				ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4251				if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4252				{
4253					ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4254					while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
4255						mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
4256					tensor = (ccv_nnc_tensor_t*)mv;
4257				}
4258				tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4259			}
4260	}
4261	int flag = 0;
4262	for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4263		if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4264		{
4265			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
 (size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)));
4266			ccv_nnc_tensor_param_t params = symbol_info->info;
4267			params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4268			params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4269			flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4270		}
4271	if (flag)
4272		return -1;
4273	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4274		if (tensor_arena->vt_tensors[i])
4275		{
4276			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
 (size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)));
4277			ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4278			if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4279			{
4280				assert(!tensor_arena->vt_alias_refs[i])((void) sizeof ((!tensor_arena->vt_alias_refs[i]) ? 1 : 0)
, __extension__ ({ if (!tensor_arena->vt_alias_refs[i]) ; else
 __assert_fail ("!tensor_arena->vt_alias_refs[i]", "ccv_nnc_symbolic_graph_compile.c"
, 4280, __extension__ __PRETTY_FUNCTION__); }));
4281				_ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4282			} else if (!tensor_arena->vt_alias_refs[i]) {
4283				ccv_nnc_tensor_param_t params = symbol_info->info;
4284				params.datatype = tensor->info.datatype;
4285				params.reserved = tensor->info.reserved;
4286				tensor->info = params;
4287			} else {
4288				off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4289				ccv_nnc_tensor_param_t params = symbol_info->info;
4290				params.datatype = tensor->info.datatype;
4291				params.reserved = tensor->info.reserved;
4292				tensor->info = params;
4293				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4294				ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4295				if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4296				{
4297					((ccv_nnc_tensor_view_t*)tensor)->off = off;
4298					memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4299				}
4300			}
4301		}
4302	// Should handle sub_tensor_arena, don't do that at the moment.
4303	assert(!graph->sub_graphs)((void) sizeof ((!graph->sub_graphs) ? 1 : 0), __extension__
 ({ if (!graph->sub_graphs) ; else __assert_fail ("!graph->sub_graphs"
, "ccv_nnc_symbolic_graph_compile.c", 4303, __extension__ __PRETTY_FUNCTION__
); }));
4304	return 0;
4305}
4306 
// Re-applies the commands recorded in symbolic_graph->exec_symbol_info onto the
// matching nodes of the concrete graph.
//
// graph_exec_arena: supplies graph_execs[0 .. graph_exec_size); entry i is paired
//                   with exec symbol info i.
// graph:            the concrete graph whose nodes are updated through
//                   ccv_nnc_graph_exec_set.
// symbolic_graph:   source of the new commands; it must hold at least
//                   graph_exec_size exec symbols (asserted below).
4307void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4308{
	// Indexing exec_symbol_info by i below is only valid if it has an entry for every arena slot.
4309	assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size)((void) sizeof ((symbolic_graph->exec_symbol_info->rnum
 >= graph_exec_arena->graph_exec_size) ? 1 : 0), __extension__
 ({ if (symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena
->graph_exec_size) ; else __assert_fail ("symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 4309, __extension__ __PRETTY_FUNCTION__
); }));
4310	int i;
4311	for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4312	{
4313		const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
		// Slots with a negative d are skipped (presumably no concrete node exists for this symbol; confirm).
4314		if (graph_exec.d < 0)
4315			continue;
4316		const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4317		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(i)));
		// Start from the symbol's command; this is a local copy, so the symbol info itself is not modified.
4318		ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4319		if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, reuse the backend and algorithm of the existing one, which presumably has already been autotuned.
4320		{
4321			new_cmd.backend = existing_cmd.backend;
4322			new_cmd.algorithm = existing_cmd.algorithm;
4323		}
4324		ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4325	}
4326}
4327 
// Releases every backing buffer held by the tensor arena and runs its disposers.
//
// Each non-null buffers[i].ptr is freed with the routine that matches its memory
// type (and build configuration), then set to 0, so a buffer already released is
// skipped on a later call. Afterwards, every registered disposer is invoked and
// the disposer array itself is freed and cleared.
4328void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4329{
4330	int i;
4331	for (i = 0; i < tensor_arena->buffer_size; i++)
4332	{
		// Nothing to release for this slot.
4333		if (!tensor_arena->buffers[i].ptr)
4334			continue;
4335		const int buffer_type = tensor_arena->buffers[i].type;;
		// The memory kind is encoded in the low two bits of the buffer type.
4336		const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
4337#ifdef HAVE_CUDA1
		// The device id is encoded in bits 8..19 of the buffer type.
4338		const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4339		if (memory_type == CCV_TENSOR_GPU_MEMORY)
4340		{
			// Prefer the arena's custom allocator when it provides a free hook; otherwise free on the device directly.
4341			if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4342				tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4343			else
4344				cufree(device_id, tensor_arena->buffers[i].ptr);
4345		} else {
4346			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4346, __extension__ __PRETTY_FUNCTION__
); }));
			// Host buffers flagged pin_mem go through cuhostfree; all other host buffers use the regular free.
4347			if (tensor_arena->buffers[i].pin_mem)
4348				cuhostfree(tensor_arena->buffers[i].ptr);
4349			else
4350				ccfreefree(tensor_arena->buffers[i].ptr);
4351		}
4352#elif defined(HAVE_MPS)
4353		const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4354		if (memory_type == CCV_TENSOR_GPU_MEMORY)
4355		{
			// The custom allocator hook is disabled on this path (kept below as commented-out code); GPU buffers always go to mpheapfree.
4356			// if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4357			// 	tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4358			// else
4359			mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4360		} else {
4361			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4361, __extension__ __PRETTY_FUNCTION__
); }));
4362			ccfreefree(tensor_arena->buffers[i].ptr);
4363		}
4364#else
		// Without a GPU backend compiled in, only CPU memory is expected.
4365		assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4365, __extension__ __PRETTY_FUNCTION__
); }));
4366		ccfreefree(tensor_arena->buffers[i].ptr);
4367#endif
		// Clear the pointer so this slot is skipped if the function runs again.
4368		tensor_arena->buffers[i].ptr = 0;
4369	}
4370	// For now, the life-cycle of the disposers is tied to the buffers. It may end before the tensor arena is deallocated.
4371	if (tensor_arena->disposers)
4372	{
4373		for (i = 0; i < tensor_arena->disposers->rnum; i++)
4374		{
4375			ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i)((void*)(((char*)((tensor_arena->disposers)->data)) + (
size_t)(tensor_arena->disposers)->rsize * (size_t)(i)));
4376			disposer->dispose(disposer->ptr, disposer->userdata);
4377		}
		// Free the array and null the field so the disposers cannot be run a second time.
4378		ccv_array_free(tensor_arena->disposers);
4379		tensor_arena->disposers = 0;
4380	}
4381}
4382 
// Fully tears down a tensor arena: first releases its buffers and disposers via
// ccv_nnc_tensor_arena_buffer_free, then hands the arena to the internal
// _ccv_nnc_tensor_arena_free routine (defined elsewhere in this file).
4383void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4384{
4385	ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4386	_ccv_nnc_tensor_arena_free(tensor_arena);
4387}
4388 
// Frees a graph exec arena together with all of its nested sub-arenas.
// Null entries in sub_arenas are skipped; the arena itself is released last.
4389void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4390{
4391	int i;
	// Recurse into the children first so nothing still points into freed memory.
4392	for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4393		if (graph_exec_arena->sub_arenas[i])
4394			ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4395	ccfreefree(graph_exec_arena);
4396}