ccv_nnc_symbolic_graph

Bug Summary

File:	nnc/ccv_nnc_symbolic_graph_compile.c
Warning:	line 4184, column 6 Dereference of null pointer
Annotated Source Code

Press '?' to see keyboard shortcuts
Show analyzer invocation
clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-05-17-121939-2670738-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA1
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12 
13// MARK - Level-3 API
14 
// Per-tensor bookkeeping record used while planning the tensor arena.
// Fields marked "Start with 1" hold 1-based references; fields marked "Start with 0" are 0-based.
15typedef struct {
16	int flags; // Bit flags read and written through the TENSOR_* macros (UNASSIGNED, ALIAS, READ_ONLY, ...).
17	int type; // Memory type of the block; the allocation planner only pairs up blocks whose type is equal.
18	int pin_mem; // This memory needs to be pinned.
19	int ref; // Reference to another tensor block. Start with 1.
20	int alias_ref; // Set if this refers to another tensor and that other one is an alias. Start with 1.
21	int bypass_ref; // Copied over from the bypass_ref of the underlying tensor symbol. Start with 1.
22	int companion_ref; // Reference to another block that shares the same memory region with this one. Start with 1. The current crude implementation requires the two to be mutual companions. Of the pair, one is the primary and the other the secondary; the allocation algorithm uses the primary throughout. NOTE(review): the original note said the block with companion_ref <= i is the primary, but IS_PRIMARY_COMPANION tests i < companion_ref - 1 (i.e. the lower-indexed block is primary) - confirm which is intended.
23	int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24	ccv_array_t* r_refs; // If this block is referenced by other blocks, this array points back to those blocks. Start with 1.
25	uint64_t size; // The expected size of the tensor.
26	int p_refs[2]; // References to the parent tensor block; at most there will be two. Start with 1.
27	ccv_array_t* dup_p_refs; // References to the parent tensor block from the duplicated tensor blocks. There could be many. Start with 0.
28	ccv_array_t* head; // The head nodes (there can be several when the graph alone cannot determine which one comes first).
29	ccv_array_t* tail; // The tail nodes (there can be several when the graph alone cannot determine which one comes last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31 
// True when block index `idx` is smaller than the 0-based index of its companion (companion_ref - 1).
// When companion_ref is 0, companion_ref - 1 is -1 and the uint32_t cast wraps it to UINT32_MAX, so the
// comparison also holds for every non-negative idx: a block with companion_ref == 0 is treated as primary.
32#define IS_PRIMARY_COMPANION(idx, block)((idx) < (uint32_t)((block).companion_ref - 1)) ((idx) < (uint32_t)((block).companion_ref - 1))
33 
// Bit values stored in ccv_nnc_tensor_block_t.flags.
// Bits 0x3 hold the block state (tested with the 0x3 mask by the TENSOR_EXPECT_* macros),
// bits 0xc hold the access mode, and the remaining bits are independent markers.
34enum {
35	UNASSIGNED = 0x1, // State: the block is unassigned.
36	ALIAS = 0x2, // State: the block is an alias.
37	READ_ONLY = 0x4,
38	WRITE_ONLY = 0x8,
39	READ_WRITE = 0xc, // Equals READ_ONLY | WRITE_ONLY.
40	ANONYMOUS = 0x10, // Mark this block as anonymous (thus, it does not refer to any specific tensor).
41	UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42	UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43};
44 
// Accessors for the `flags` field of a tensor block `t`.
// NOTE(review): `t` is expanded without parentheses (t.flags), so pass a plain lvalue such as tensor_blocks[i].
// State bits (mask 0x3): "ordinary" means neither UNASSIGNED nor ALIAS is set.
45#define TENSOR_EXPECT_ORDINARY(t)((t.flags & 0x3) == 0) ((t.flags & 0x3) == 0)
46#define TENSOR_EXPECT_SET_ORDINARY(t)(t.flags = (t.flags & ~0x3)) (t.flags = (t.flags & ~0x3))
47#define TENSOR_EXPECT_UNASSIGNED(t)((t.flags & 0x3) == UNASSIGNED) ((t.flags & 0x3) == UNASSIGNED)
48#define TENSOR_EXPECT_SET_UNASSIGNED(t)(t.flags = ((t.flags & ~0x3) | UNASSIGNED)) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
// Clears only bit 0x1, so an ALIAS bit (0x2), if present, is left untouched.
49#define TENSOR_EXPECT_UNSET_UNASSIGNED(t)(t.flags = (t.flags & ~0x1)) (t.flags = (t.flags & ~0x1))
50#define TENSOR_EXPECT_ALIAS(t)((t.flags & 0x3) == ALIAS) ((t.flags & 0x3) == ALIAS)
// Computable: the state is neither ALIAS nor UNASSIGNED.
51#define TENSOR_EXPECT_COMPUTABLE(t)(!((t.flags & 0x3) == ALIAS) && !((t.flags & 0x3
) == UNASSIGNED)) (!TENSOR_EXPECT_ALIAS(t)((t.flags & 0x3) == ALIAS) && !TENSOR_EXPECT_UNASSIGNED(t)((t.flags & 0x3) == UNASSIGNED))
// Access-mode bits (mask 0xc): yields / stores READ_ONLY, WRITE_ONLY or READ_WRITE.
52#define TENSOR_READ_WRITE(t)(t.flags & 0xc) (t.flags & 0xc)
53#define TENSOR_SET_READ_WRITE(t, rw)(t.flags = ((t.flags & ~0xc) | rw)) (t.flags = ((t.flags & ~0xc) | rw))
// Independent marker bits: each SET macro turns its bit on, each IS macro yields non-zero when the bit is set.
54#define TENSOR_SET_ANONYMOUS(t)(t.flags = ((t.flags & ~0x10) | ANONYMOUS)) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55#define TENSOR_IS_ANONYMOUS(t)(t.flags & ANONYMOUS) (t.flags & ANONYMOUS)
56#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t)(t.flags = (t.flags | UNFOLDABLE_AS_INPUT)) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t)(t.flags & UNFOLDABLE_AS_INPUT) (t.flags & UNFOLDABLE_AS_INPUT)
58#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t)(t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT)) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t)(t.flags & UNFOLDABLE_AS_OUTPUT) (t.flags & UNFOLDABLE_AS_OUTPUT)
60 
// Non-zero when `flags` has CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS or CCV_NNC_TENSOR_SYMBOL_INIT_ONES set.
61#define TENSOR_REQUIRE_INIT(flags)(((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags)
 & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62 
63// Holds additional information about the exec nodes.
64typedef struct {
65	int flags; // Bit flags; presumably built from the CCV_NNC_GRAPH_EXEC_ATTR_* values - confirm against the users of this struct.
66} ccv_nnc_graph_exec_flag_t;
67 
// Attribute bits for exec nodes.
68enum {
69	CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert an additional IO transfer for the case..of statement.
70};
71 
// A candidate tensor block considered for placement by the allocation planner.
72typedef struct {
73	int index; // Index of the block in the tensor_blocks array.
74	int oc; // Overlap count taken from the adjacency record (plus the companion's count when there is one).
75	int type; // Copy of the block's type.
76	uint64_t size; // Size of the block (the larger of the block and its companion when there is one).
77} ccv_nnc_tensor_opt_t;
78 
79// We first group tensors of the same type together (because memory is never reused across types).
80// Within one type we then sort by size (larger first) and, for equal sizes, by oc (larger first).
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)void _ccv_nnc_tensor_opt_sort_by_size_and_oc(ccv_nnc_tensor_opt_t
 *array, size_t total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_opt_t
 t; int sp = 0; struct { ccv_nnc_tensor_opt_t *lb; ccv_nnc_tensor_opt_t
 *ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_opt_t
* left = stack[sp].lb; ccv_nnc_tensor_opt_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_opt_t
* ptr; ccv_nnc_tensor_opt_t* ptr2; if( n <= isort_thresh )
 { insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
 { for( ptr2 = ptr; ptr2 > left && more_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
 ((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_opt_t
* left0; ccv_nnc_tensor_opt_t* left1; ccv_nnc_tensor_opt_t* right0
; ccv_nnc_tensor_opt_t* right1; ccv_nnc_tensor_opt_t* pivot; ccv_nnc_tensor_opt_t
* a; ccv_nnc_tensor_opt_t* b; ccv_nnc_tensor_opt_t* c; int swap_cnt
 = 0; left0 = left; right0 = right; pivot = left + (n/2); if(
 n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
 + 2*d; left = more_than(*a, *b, aux) ? (more_than(*b, *c, aux
) ? b : (more_than(*a, *c, aux) ? c : a)) : (more_than(*c, *b
, aux) ? b : (more_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = more_than(*a, *b, aux) ? (
more_than(*b, *c, aux) ? b : (more_than(*a, *c, aux) ? c : a)
) : (more_than(*c, *b, aux) ? b : (more_than(*a, *c, aux) ? a
 : c)); a = right - 2*d, b = right - d, c = right; right = more_than
(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than(*a, *
c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
 = more_than(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than
(*a, *c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !more_than(*pivot, *left
, aux) ) { if( !more_than(*left, *pivot, aux) ) { if( left >
 left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
 right && !more_than(*right, *pivot, aux) ) { if( !more_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
 = 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
 swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
 left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
 _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
 ((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
 ((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
 _b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
 1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
 stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
 } else left = left0, right = left0 + n - 1; } else if( m >
 1 ) left = right0 - m + 1, right = right0; else break; } } }
 }
83#undef more_than
// A (block, hop distance) pair; arrays of these are ordered by ascending hop via _ccv_nnc_sort_by_hops.
84typedef struct {
85	int idx; // Block reference; the allocation planner stores block index + 1 here.
86	int hop; // Hop count read from a hop matrix cell.
87} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)void _ccv_nnc_sort_by_hops(ccv_nnc_tensor_hop_t *array, size_t
 total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_hop_t
 t; int sp = 0; struct { ccv_nnc_tensor_hop_t *lb; ccv_nnc_tensor_hop_t
 *ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_hop_t
* left = stack[sp].lb; ccv_nnc_tensor_hop_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_hop_t
* ptr; ccv_nnc_tensor_hop_t* ptr2; if( n <= isort_thresh )
 { insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
 { for( ptr2 = ptr; ptr2 > left && less_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
 ((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_hop_t
* left0; ccv_nnc_tensor_hop_t* left1; ccv_nnc_tensor_hop_t* right0
; ccv_nnc_tensor_hop_t* right1; ccv_nnc_tensor_hop_t* pivot; ccv_nnc_tensor_hop_t
* a; ccv_nnc_tensor_hop_t* b; ccv_nnc_tensor_hop_t* c; int swap_cnt
 = 0; left0 = left; right0 = right; pivot = left + (n/2); if(
 n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
 + 2*d; left = less_than(*a, *b, aux) ? (less_than(*b, *c, aux
) ? b : (less_than(*a, *c, aux) ? c : a)) : (less_than(*c, *b
, aux) ? b : (less_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = less_than(*a, *b, aux) ? (
less_than(*b, *c, aux) ? b : (less_than(*a, *c, aux) ? c : a)
) : (less_than(*c, *b, aux) ? b : (less_than(*a, *c, aux) ? a
 : c)); a = right - 2*d, b = right - d, c = right; right = less_than
(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than(*a, *
c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
 = less_than(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than
(*a, *c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !less_than(*pivot, *left
, aux) ) { if( !less_than(*left, *pivot, aux) ) { if( left >
 left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
 right && !less_than(*right, *pivot, aux) ) { if( !less_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
 = 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
 swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
 left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
 _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
 ((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
 ((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
 _b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
 1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
 stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
 } else left = left0, right = left0 + n - 1; } else if( m >
 1 ) left = right0 - m + 1, right = right0; else break; } } }
 }
90#undef less_than
91 
92// Returns 1 when node set a is after node set b, where a node that appears in both sets still counts as "after" (inclusive); returns 0 otherwise.
// a and b are arrays of int node indices and must be non-NULL.
// exec_dep is queried as cell (node from a, node from b); a missing or zero cell means "not known to be after".
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95	assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
 ("a", "ccv_nnc_symbolic_graph_compile.c", 95, __extension__ __PRETTY_FUNCTION__
); }));
96	assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
 ("b", "ccv_nnc_symbolic_graph_compile.c", 96, __extension__ __PRETTY_FUNCTION__
); }));
97	int x, y;
98	for (x = 0; x < b->rnum; x++)
99	{
100		const int p = *(int*)ccv_array_get(b, x)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(x)));
101		int flag = 0;
102		// First look for p itself in a: when a contains p (in the extreme case a is a superset of b), p needs no dependency check.
103		for (y = 0; !flag && y < a->rnum; y++)
104		{
105			const int q = *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y)));
106			flag = (p == q);
107		}
		// p is not in a: every node of a must then have a non-zero exec_dep entry against p.
108		if (!flag)
109			for (y = 0; y < a->rnum; y++)
110			{
111				ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y))), p);
112				if (!cell.i32 || cell.i32[0] == 0)
113					return 0;
114			}
115	}
116	// If b->rnum == 0, a is after b for sure.
117	// Otherwise, if a->rnum == 0, the loops above checked nothing, but since b->rnum > 0 we cannot say a is after b.
118	// If both a->rnum > 0 and b->rnum > 0, the logic above has already checked every pair.
119	return (a->rnum > 0 || b->rnum == 0);
120}
121 
// Returns the largest exec_dep value found over all (node of a, node of b) pairs when every such pair
// has a non-zero entry, i.e. every node of a is strictly after every node of b. Returns 0 when either
// set is empty, when a node of a has no row in exec_dep, or when any pair has a missing or zero cell.
// a and b are arrays of int node indices and must be non-NULL.
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124	assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
 ("a", "ccv_nnc_symbolic_graph_compile.c", 124, __extension__
 __PRETTY_FUNCTION__); }));
125	assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
 ("b", "ccv_nnc_symbolic_graph_compile.c", 125, __extension__
 __PRETTY_FUNCTION__); }));
	// With an empty set on either side there is nothing to order, so report "not after".
126	if (!a->rnum || !b->rnum)
127		return 0;
128	int x, y, max_hop = 0;
129	for (x = 0; x < a->rnum; x++)
130	{
		// Fetch the exec_dep row of this node of a once and reuse it for every node of b.
131		ccv_sparse_matrix_vector_t* const vector = ccv_get_sparse_matrix_vector(exec_dep, *(int*)ccv_array_get(a, x)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(x))));
132		if (!vector)
133			return 0;
134		for (y = 0; y < b->rnum; y++)
135		{
136			const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, *(int*)ccv_array_get(b, y)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(y))));
137			if (!cell.i32 || cell.i32[0] == 0)
138				return 0;
139			if (cell.i32[0] > max_hop)
140				max_hop = cell.i32[0];
141		}
142	}
143	// Every pair passed the check in the nested loop, therefore a is verifiably, deterministically after b.
144	// The max hop also tells, when that is the case, how many hops at most are needed to get from a to b.
145	return max_hop;
146}
147 
148// Checks whether every head node of block a is deterministically after every tail node of block b.
// Forwards a.head and b.tail to _ccv_nnc_tensor_block_a_after_b_exclusively and returns its result:
// the maximum hop count when the ordering holds, 0 otherwise. Both blocks are passed by value.
149static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
150{
151	return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
152}
153 
// Result of allocation planning: which buffers exist and where each block sits inside them.
154typedef struct {
155	ccv_array_t** alloc_dep; // Array of ccv_array_t* slots; presumably one per tensor block - confirm against the constructor.
156	int vt_block_size; // Presumably the number of entries in vt_blocks - TODO confirm.
157	int buffer_size; // Presumably the number of entries in buffers - TODO confirm.
158	int block_size; // Presumably the number of entries in blocks - TODO confirm.
159	int* vt_blocks; // A reference to the block, because blocks only contains the available blocks (thus, it doesn't consider aliases etc.). -1 means no block is pointed to. Starts at 0.
160	struct {
161		int type; // The type from tensor blocks.
162		int pin_mem; // Whether this is pinned memory.
163		int flags; // The flags (currently for READ_ONLY or not).
164		uint64_t size; // The size of the buffer allocated.
165		int p_refs[2]; // Reference to the upper level block. Starts at 1. Only index 0 is valid throughout; the second slot is used in the code as a temporary placeholder.
166		ccv_array_t* dup_p_refs; // References to the parent tensor block from the duplicated tensor blocks. From a buffer, it can point to multiple blocks because the buffer can be associated with multiple tensor blocks that point to different outputs (for example, in the 1st unroll pointing to one block while in the 2nd unroll pointing to another). Start with 0.
167	}* buffers;
168	struct {
169		int buffer_ref; // A reference for the block to the buffer it uses. Starts at 0.
170		int block_ref; // A reference to which block in the given tensor_block to use.
171		uint64_t offset; // The offset of this block.
172	}* blocks;
173} ccv_nnc_tensor_alloc_prep_t;
174 
// Preparation state for one symbolic graph; links to its parent prep (p) and to the preps of its sub-graphs (sub_preps).
175typedef struct ccv_nnc_symbolic_graph_prep_s {
176	int flags;
177	int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for it.
178	int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
179	int exec_idx;
180	int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
181	int tensor_symbol_info_size; // Presumably the length of tensor_symbol_info - TODO confirm.
182	int exec_symbol_info_size; // Presumably the length of exec_symbol_info - TODO confirm.
183	int tensor_block_size; // Presumably the length of tensor_blocks - TODO confirm.
184	int sub_prep_size; // Presumably the length of sub_preps - TODO confirm.
185	ccv_nnc_tensor_block_t* tensor_blocks;
186	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
187	ccv_nnc_graph_exec_flag_t* exec_flags;
188	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
189	int* dup_tensor_block_ref;
190	ccv_nnc_graph_visit_t* visit;
191	ccv_nnc_tensor_alloc_prep_t* alloc_prep;
192	struct ccv_nnc_symbolic_graph_prep_s* p; // Presumably the parent prep (p_idx indexes into the parent's sub-graph array) - confirm.
193	struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
194	// The structures below do not need to be freed when the prep is deallocated.
195	const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
196	ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after it is created.
197	ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep either.
198	ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the life-cycle of the inputs for the while expr.
199} ccv_nnc_symbolic_graph_prep_t;
200 
// Interference record for one tensor block, filled in by the allocation planner.
201typedef struct {
202	int oc; // Overlap count: incremented once for every other block this block interferes with.
203	ccv_array_t* itf; // Array of int indices of the interfering blocks; stays NULL until the first interference is recorded.
204} ccv_nnc_tensor_block_adjacent_t;
205 
206static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
207{
208	// Compute how many dis-continuous buffers are needed.
209	// We prefer to have several dis-continuous buffers instead of one big buffer because
210	// in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
211	// to fully utilize memory.
212	int i, j, k;
213	ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloccalloc(tensor_block_size, sizeof(ccv_array_t*));
214	int allocable_tensor_size = 0, available_tensor_size = 0;
215	for (i = 0; i < tensor_block_size; i++)
216		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
217		{
218			// Tensors that we need the header info.
219			++available_tensor_size;
220			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
221				// Tensors that we actually need to allocate (exclude the alias).
222				++allocable_tensor_size;
223		}
224	ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
225	ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
226	ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
227	// Overlap count.
228	for (i = 0; i < tensor_block_size; i++)
229		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
230			for (j = i + 1; j < tensor_block_size; j++)
231				if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j])(!((tensor_blocks[j].flags & 0x3) == ALIAS) && !(
(tensor_blocks[j].flags & 0x3) == UNASSIGNED)))
232				{
233					// We only reuse buffers within the same memory type. The tensor_dt / tensor_df
234					// matrices are only queried later for same-type candidates in this function,
235					// thus cross-type hop relations are not needed for allocation planning here.
236					if (tensor_blocks[i].type != tensor_blocks[j].type)
237						continue;
238					// Check to see if they interfere (default to yes).
239					// If any of the i's head is deterministically later than j's tail
240					// or any of the i's tail is deterministically earlier than j's head, they don't interfere.
241					const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
242					int j_hop_i = 0;
243					if (i_hop_j > 0)
244					{
245						ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
246						ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
247					} else {
248						// It cannot be that both directions are positive. If i can hop to j, we don't
249						// need the reverse hop value for any subsequent allocation decision.
250						j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
251						if (j_hop_i > 0)
252						{
253							ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
254							ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
255						}
256					}
257					if (!i_hop_j && !j_hop_i)
258					{
259						if (!adj[i].itf)
260							adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
261						ccv_array_push(adj[i].itf, &j);
262						++adj[i].oc;
263						if (!adj[j].itf)
264							adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
265						ccv_array_push(adj[j].itf, &i);
266						++adj[j].oc;
267					}
268				}
269	const int exec_dep_rows = exec_dep->rows;
270	ccv_matrix_free(exec_dep);
271	ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
272	int* const assigned = (int*)cccalloccalloc(tensor_block_size, sizeof(int));
273	uint64_t* const allocated_offset = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
274	uint64_t* const allocated_size = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
275	uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
276	int num_assigned = 0; 
277	// I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
278	// Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
279	// The first channel denotes the bytes available for allocation,
280	// the second channel denotes the offset available for the allocation,
281	ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
282	ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
283	for (j = 0; j < allocable_tensor_size;)
284	{
285		// Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
286		uint64_t max_size = 0;
287		ccv_array_clear(opt);
288		int current_type = 0; // Deal with one type at a time.
289		for (i = 0; i < tensor_block_size; i++)
290			if (tensor_blocks[i].size >= max_size &&
291				TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && !assigned[i] &&
292				IS_PRIMARY_COMPANION(i, tensor_blocks[i])((i) < (uint32_t)((tensor_blocks[i]).companion_ref - 1)) &&
293				(!current_type || tensor_blocks[i].type == current_type))
294			{
295				ccv_nnc_tensor_opt_t a = {
296					.size = tensor_blocks[i].size,
297					.index = i,
298					.oc = adj[i].oc,
299					.type = tensor_blocks[i].type,
300				};
301				assert(a.type)((void) sizeof ((a.type) ? 1 : 0), __extension__ ({ if (a.type
) ; else __assert_fail ("a.type", "ccv_nnc_symbolic_graph_compile.c"
, 301, __extension__ __PRETTY_FUNCTION__); }));
302				current_type = a.type; // Now we now the primary type we should deal with.
303				if (tensor_blocks[i].companion_ref)
304				{
305					const int companion_ref = tensor_blocks[i].companion_ref - 1;
306					a.size = ccv_max(a.size, tensor_blocks[companion_ref].size)({ typeof (a.size) _a = (a.size); typeof (tensor_blocks[companion_ref
].size) _b = (tensor_blocks[companion_ref].size); (_a > _b
) ? _a : _b; });
307					a.oc += adj[companion_ref].oc;
308				}
309				// In case we have a tie, take them all in the array.
310				if (a.size > max_size)
311					ccv_array_clear(opt), max_size = a.size;
312				ccv_array_push(opt, &a);
313			}
314		assert(opt->rnum > 0)((void) sizeof ((opt->rnum > 0) ? 1 : 0), __extension__
 ({ if (opt->rnum > 0) ; else __assert_fail ("opt->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 314, __extension__ __PRETTY_FUNCTION__
); }));
315		// Order opt array by the oc because type and size should be equal at this point.
316		_ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
317		// Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
318		int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
319		uint64_t min_val[2] = {
320			0, 0
321		};
322		if (j > 0)
323		{
324			for (i = 0; i < opt->rnum; i++)
325			{
326				ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i)((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
 (size_t)(i)));
327				if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
328					continue;
329				// Now, determine the order between a and c. After this, we can always check whether y
330				// can hop to the earliest one and if the latest one can hop to x.
331				// The earliest one will be called p and the latest one will be called q.
332				int p = a.index;
333				int q = a.index;
334				if (tensor_blocks[a.index].companion_ref)
335				{
336					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
337					if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
338						continue;
339					const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
340					if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
341						p = companion_ref;
342					else {
343						const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
344						if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
345							q = companion_ref;
346						else { // Otherwise, b is in between p and q.
347							const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
348							const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
349							assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0)((void) sizeof ((p_hop_b.i32 && p_hop_b.i32[0] > 0
 && b_hop_q.i32 && b_hop_q.i32[0] > 0) ? 1
 : 0), __extension__ ({ if (p_hop_b.i32 && p_hop_b.i32
[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] >
 0) ; else __assert_fail ("p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 349, __extension__ __PRETTY_FUNCTION__
); }));
350						}
351					}
352				}
353				assert(tensor_blocks[q].type == tensor_blocks[p].type)((void) sizeof ((tensor_blocks[q].type == tensor_blocks[p].type
) ? 1 : 0), __extension__ ({ if (tensor_blocks[q].type == tensor_blocks
[p].type) ; else __assert_fail ("tensor_blocks[q].type == tensor_blocks[p].type"
, "ccv_nnc_symbolic_graph_compile.c", 353, __extension__ __PRETTY_FUNCTION__
); }));
354				const int type = tensor_blocks[p].type;
355				// y is always earlier than x, but this is hard to assert now.
356				// If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
357				// Thus, the hop between y and x (through a) should be smallest ones.
358				// We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
359				// out of q. For these nodes, we try to verify whether they form a connection (by checking against
360				// alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
361				int y_size = 0;
362				ccv_nnc_tensor_hop_t* const y_buf = buf;
363#define for_block(y, val) do { \
364					if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
365						y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
366							.idx = y + 1, .hop = ((int*)val)[0] \
367						}; \
368				} while(0)
369				ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
370				if (y_vector)
371					CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
372#undef for_block
373				assert(y_size <= tensor_block_size)((void) sizeof ((y_size <= tensor_block_size) ? 1 : 0), __extension__
 ({ if (y_size <= tensor_block_size) ; else __assert_fail (
"y_size <= tensor_block_size", "ccv_nnc_symbolic_graph_compile.c"
, 373, __extension__ __PRETTY_FUNCTION__); }));
374				int x_size = 0;
375				ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
376#define for_block(x, val) do { \
377					if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
378						x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
379							.idx = x + 1, .hop = ((int*)val)[0] \
380						}; \
381				} while(0)
382				ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
383				if (x_vector)
384					CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
385#undef for_block
386				assert(y_size + x_size <= tensor_block_size)((void) sizeof ((y_size + x_size <= tensor_block_size) ? 1
 : 0), __extension__ ({ if (y_size + x_size <= tensor_block_size
) ; else __assert_fail ("y_size + x_size <= tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 386, __extension__ __PRETTY_FUNCTION__
); }));
387				int x, y;
388				if (y_size > 1)
389					_ccv_nnc_sort_by_hops(y_buf, y_size, 0);
390				for (y = 0; y < y_size; y++)
391				{
392					const int hop = exec_dep_rows + y_buf[y].hop;
393					if (hop >= min_hop)
394						break;
395					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
396					if (val.u64 && val.u64[0] >= a.size)
397					{
398						min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
399							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
400						break;
401					}
402				}
403				if (x_size > 1)
404					_ccv_nnc_sort_by_hops(x_buf, x_size, 0);
405				for (x = 0; x < x_size; x++)
406				{
407					const int hop = exec_dep_rows + x_buf[x].hop;
408					if (hop >= min_hop)
409						break;
410					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
411					if (val.u64 && val.u64[0] >= a.size)
412					{
413						min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
414							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
415						break;
416					}
417				}
418				if (x_size > 0)
419				{
420					const int x_min_hop = x_buf[0].hop;
421					for (y = 0; y < y_size; y++)
422					{
423						const int y_hop_p_v = y_buf[y].hop;
424						if (y_hop_p_v + x_min_hop >= min_hop)
425							break;
426						ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
427						if (y_vector)
428						{
429							for (x = 0; x < x_size; x++)
430							{
431								const int q_hop_x_v = x_buf[x].hop;
432								const int hop = y_hop_p_v + q_hop_x_v;
433								if (hop >= min_hop)
434									break;
435								const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
436								if (val.u64 && val.u64[0] >= a.size)
437								{
438									min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
439										min_val[0] = val.u64[0], min_val[1] = val.u64[1];
440									break;
441								}
442							}
443						}
444					}
445				}
446				// If I found a place, stop, and exit.
447				if (min_y > 0 || min_x < tensor_block_size + 1)
448				{
449					min_i = i;
450					break;
451				}
452				// There is no space to insert this block, mark it as such.
453				tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
454				if (tensor_blocks[a.index].companion_ref)
455				{
456					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
457					tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
458				}
459			}
460		}
461		// If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group),
462		// and default to the largest size available.
463		ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i))((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
 (size_t)(({ typeof (0) _a = (0); typeof (min_i) _b = (min_i)
; (_a > _b) ? _a : _b; }))));
464		if (min_i == -1)
465		{
466			allocated_size[num_assigned] = a.size;
467			++num_assigned;
468		}
469		int assign_group = num_assigned;
470		if (min_y > 0)
471		{
472			assign_group = assigned[min_y - 1];
473			// The y and x should belong to the same assigned group.
474			assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group)((void) sizeof ((min_x == tensor_block_size + 1 || assigned[min_x
 - 1] == assign_group) ? 1 : 0), __extension__ ({ if (min_x ==
 tensor_block_size + 1 || assigned[min_x - 1] == assign_group
) ; else __assert_fail ("min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group"
, "ccv_nnc_symbolic_graph_compile.c", 474, __extension__ __PRETTY_FUNCTION__
); }));
475		} else if (min_x < tensor_block_size + 1)
476			assign_group = assigned[min_x - 1];
477		// If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
478		if (min_y != 0 || min_x != tensor_block_size + 1)
479		{
480			uint64_t val[2] = {
481				min_val[0], min_val[1]
482			};
483			assert(val[0] >= a.size)((void) sizeof ((val[0] >= a.size) ? 1 : 0), __extension__
 ({ if (val[0] >= a.size) ; else __assert_fail ("val[0] >= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 483, __extension__ __PRETTY_FUNCTION__
); }));
484			val[0] -= a.size;
485			val[1] = val[1] + a.size; // Move the offset to the next one.
486			ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
487		}
488		int strings[3];
489		strings[0] = a.index + 1;
490		int string_size = 1;
491		// Assign out the designated companion if it exists.
492		if (tensor_blocks[a.index].companion_ref)
493		{
494			const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495			assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type)((void) sizeof ((tensor_blocks[a.index].type == tensor_blocks
[companion_ref].type) ? 1 : 0), __extension__ ({ if (tensor_blocks
[a.index].type == tensor_blocks[companion_ref].type) ; else __assert_fail
 ("tensor_blocks[a.index].type == tensor_blocks[companion_ref].type"
, "ccv_nnc_symbolic_graph_compile.c", 495, __extension__ __PRETTY_FUNCTION__
); }));
496			const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497			if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498			{
499				for (i = 0; i < string_size; i++)
500					strings[i + 1] = strings[i];
501				strings[0] = companion_ref + 1;
502			} else {
503				const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504				if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505					strings[string_size] = companion_ref + 1;
506				else {
507					// Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
508					assert(string_size == 2)((void) sizeof ((string_size == 2) ? 1 : 0), __extension__ ({
 if (string_size == 2) ; else __assert_fail ("string_size == 2"
, "ccv_nnc_symbolic_graph_compile.c", 508, __extension__ __PRETTY_FUNCTION__
); }));
509					strings[2] = strings[1];
510					strings[1] = companion_ref + 1;
511				}
512			}
513			++string_size;
514		}
515		// Assign out and update oc.
516		for (i = 0; i < string_size; i++)
517		{
518			const int index = strings[i] - 1;
519			// Assign out the selected one.
520			assigned[index] = assign_group;
521			// The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522			allocated_offset[index] = min_val[1];
523			if (adj[index].itf)
524				for (k = 0; k < adj[index].itf->rnum; k++)
525				{
526					const int d = *(int*)ccv_array_get(adj[index].itf, k)((void*)(((char*)((adj[index].itf)->data)) + (size_t)(adj[
index].itf)->rsize * (size_t)(k)));
527					if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])(!((tensor_blocks[d].flags & 0x3) == ALIAS) && !(
(tensor_blocks[d].flags & 0x3) == UNASSIGNED)))
528						--adj[d].oc;
529				}
530		}
531		uint64_t val[2] = {
532			a.size, min_val[1]
533		};
534		uint64_t consumed_size = 0;
535		// Go over from min_y to string_size (excluding min_x).
536		for (i = 0; i < string_size; i++)
537		{
538			const uint64_t size = tensor_blocks[strings[i] - 1].size;
539			assert(size <= a.size)((void) sizeof ((size <= a.size) ? 1 : 0), __extension__ (
{ if (size <= a.size) ; else __assert_fail ("size <= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 539, __extension__ __PRETTY_FUNCTION__
); }));
540			// Update consumed size if it is bigger than "size".
541			if (size > consumed_size)
542			{
543				val[0] = size - consumed_size;
544				ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545				consumed_size = size;
546				val[1] = min_val[1] + consumed_size;
547			}
548			// If it consumed all the flow, break out.
549			if (consumed_size == a.size)
550				break;
551		}
552		for (i = 0; i < string_size; i++)
553		{
554			const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555			uint64_t val[2] = {
556				i_size, min_val[1]
557			};
558			uint64_t consumed_size = 0;
559			for (k = i + 1; k < string_size; k++)
560			{
561				const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size)({ typeof (i_size) _a = (i_size); typeof (tensor_blocks[strings
[k] - 1].size) _b = (tensor_blocks[strings[k] - 1].size); (_a
 < _b) ? _a : _b; });
562				// Update consumed size if it is bigger than "size".
563				if (size > consumed_size)
564				{
565					val[0] = size - consumed_size;
566					ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567					consumed_size = size;
568					val[1] = min_val[1] + consumed_size;
569				}
570				// If it consumed all the flow, break out.
571				if (consumed_size == i_size)
572					break;
573			}
574			val[0] = i_size - consumed_size;
575			// Still have residual, flow it to min_x.
576			if (val[0] > 0)
577				ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578		}
579		if (min_i == -1)
580		{
581			// If we decide to insert a new edge, simply mark every block that does not interfere with it to be retried.
582			const int p = strings[0] - 1;
583			const int q = strings[string_size - 1] - 1;
584			const int type = tensor_blocks[p].type;
585#define for_block(y, val) do { \
586				if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
587				{ \
588					tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
589					if (tensor_blocks[y].companion_ref) \
590					{ \
591						const int companion_ref = tensor_blocks[y].companion_ref - 1; \
592						tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
593					} \
594				} \
595			} while(0)
596			ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
597			if (y_vector)
598				CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
599#undef for_block
600#define for_block(x, val) do { \
601				if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
602				{ \
603					tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
604					if (tensor_blocks[x].companion_ref) \
605					{ \
606						const int companion_ref = tensor_blocks[x].companion_ref - 1; \
607						tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
608					} \
609				} \
610			} while(0)
611			ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
612			if (x_vector)
613				CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
614#undef for_block
615		}
616		j += string_size;
617	}
618	ccfreefree(tensor_block_cannot_insert);
619	ccfreefree(buf);
620	ccv_array_free(opt);
621	ccv_matrix_free(tensor_df);
622	ccv_matrix_free(tensor_dt);
623#define for_block(y, x, val) do { \
624		if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
625		{ \
626			if (!alloc_dep[x - 1]) \
627				alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
628			ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
629		} \
630	} while (0)
631	CCV_SPARSE_FOREACH(alloc, for_block)do { if ((alloc)->major & CCV_SPARSE_COL_MAJOR) { switch
 ((((alloc)->type) & 0xFF000)) { case CCV_32S: { do { uint32_t
 _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__
((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF
); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0
; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t* const _idx_
 = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const
 _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 ||
 !_v_->size) continue; for (_j_ = 0; _j_ < _v_->size
; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.i32 +
 (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
 (alloc)->size; __attribute__((unused)) const size_t _c_ =
 (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.u8 + (0))); } } } } while (0); } } } else
 { switch ((((alloc)->type) & 0xFF000)) { case CCV_32S
: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->
size; __attribute__((unused)) const size_t _c_ = (((alloc)->
type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
 (alloc)->size; __attribute__((unused)) const size_t _c_ =
 (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } }
 } while (0);
632#undef for_block
633	ccv_matrix_free(alloc);
634	for (i = 0; i < tensor_block_size; i++)
635		if (adj[i].itf)
636			ccv_array_free(adj[i].itf);
637	ccfreefree(adj);
638	ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
639	alloc_prep->alloc_dep = alloc_dep;
640	alloc_prep->vt_block_size = tensor_block_size;
641	alloc_prep->buffer_size = num_assigned;
642	alloc_prep->block_size = available_tensor_size;
643	alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
644	alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
645	alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
646	memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
647	for (i = 0; i < num_assigned; i++)
648		alloc_prep->buffers[i].size = allocated_size[i];
649	if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
650	{
651		size_t total_size = 0;
652		for (i = 0; i < num_assigned; i++)
653			total_size += allocated_size[i];
654		PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf
("Total buffer size of %zu to be allocated\n", total_size); fflush
(stdout); } } while (0);
655	}
656	ccfreefree(allocated_size);
657	j = 0;
658	// Assigning out the tensors (in case of sharing tensors / in-place ops).
659	for (i = 0; i < tensor_block_size; i++)
660		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
661		{
662			alloc_prep->blocks[j].block_ref = i;
663			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
664			{
665				alloc_prep->vt_blocks[i] = j;
666				// Also, set its allocations.
667				assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 667, __extension__ __PRETTY_FUNCTION__
); }));
668				const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
669				alloc_prep->blocks[j].offset = allocated_offset[i];
670				if (!alloc_prep->buffers[buffer_ref].type)
671					alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
672				alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
673				alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
674				assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
 alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
 ({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 674, __extension__ __PRETTY_FUNCTION__
); }));
675			} else {
676				alloc_prep->vt_blocks[i] = -1;
677				alloc_prep->blocks[j].buffer_ref = -1;
678				alloc_prep->blocks[j].offset = 0;
679			}
680			++j;
681		} else
682			alloc_prep->vt_blocks[i] = -1;
683	ccfreefree(allocated_offset);
684	ccfreefree(assigned);
685	return alloc_prep;
686}
687 
688static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
689{
690	int i;
691	for (i = 0; i < alloc_prep->vt_block_size; i++)
692		if (alloc_prep->alloc_dep[i])
693			ccv_array_free(alloc_prep->alloc_dep[i]);
694	for (i = 0; i < alloc_prep->buffer_size; i++)
695		if (alloc_prep->buffers[i].dup_p_refs)
696			ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
697	ccfreefree(alloc_prep->alloc_dep);
698	ccfreefree(alloc_prep);
699}
700 
701// Simple allocator from ccv_array_t.
702static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
703{
704	int pos = tensor_metadata->rnum;
705	int rsize = (size + 15) / 16;
706	ccv_array_resize(tensor_metadata, pos + rsize);
707	return (pos << 1) + 1;
708}
709 
710static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
711{
712	assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 712, __extension__ __PRETTY_FUNCTION__
); }));
713	return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)));
714}
715 
// Non-zero when the low bit of ptr is set. _ccv_nnc_tensor_metadata_pos_new returns
// positions with that bit set, so this distinguishes an encoded metadata position
// from an ordinary tensor pointer.
716#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
717 
718static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
719{
720	// If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
721	if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
722		return vt_tensor;
723	ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
724	if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
725	{
726		const int alias_ref = tensor->alias_ref;
727		tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
728		_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
729	}
730	if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
731	{
732		ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
733		int i;
734		const int count = mv->kind + mv->repeat;
735		for (i = 0; i < count; i++)
736		{
737			if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1))
738			{
739				const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i];
740				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
741				_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
742			}
743		}
744		// No need to recursively do parent pointer, otherwise we are in deep rewire.
745		if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
746			mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
747		if (mv->sp)
748			for (i = 0; i < mv->sp->rnum; i++)
749			{
750				ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)));
751				if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
752				{
753					const int pos = (int)(intptr_t)*tensor;
754					*tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
755					assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
 "ccv_nnc_symbolic_graph_compile.c", 755, __extension__ __PRETTY_FUNCTION__
); }));
756					_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
757				}
758			}
759	}
760	return tensor;
761}
762 
// Pairs a read-only byte pointer with an integer position.
// NOTE(review): no use of this type is visible in the surrounding functions — confirm how (or whether) it is used elsewhere in the file.
763typedef struct {
764	const uint8_t* ptr;
765	int pos;
766} ccv_nnc_tensor_block_pos_t;
767 
// Find backing memory for a tensor block by climbing from `prep` through preps[idx - 1] ... preps[0].
//
// Starting from block_ref in `prep` (after following its .ref chain), the buffer's p_refs[0] names a block
// in the previous prep; at each step the block's offset is accumulated. When ch[i] is non-zero the
// dup_tensor_block_ref entry for that choice is used instead of the block itself.
// As soon as a prep is reached whose tensor arena already holds a pointer for the buffer, a new tensor is
// allocated in tensor_metadata on that pointer with the accumulated offset, and its position is returned.
//
// Returns the (non-zero) metadata position of the new tensor, or 0 when no buffer pointer was found or
// when any remaining ch[] entry below the matching level is non-zero.
768static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
769{
770	int i;
771	int unref_block_ref = block_ref;
	// .ref holds (index + 1) of the referenced block, 0 meaning "no reference"; follow the chain to its end.
772	while (prep->tensor_blocks[unref_block_ref].ref)
773		unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
774	int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
775	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 775, __extension__ __PRETTY_FUNCTION__); }));
776	assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
 == prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
 ("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 776, __extension__ __PRETTY_FUNCTION__
); }));
777	const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
778	uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
	// p_refs[0] - 1 is the block index to look at in the previous prep; the loop asserts it is non-negative.
779	int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
	// Walk the preps before idx, nearest first.
780	for (i = idx - 1; i >= 0; i--)
781	{
782		assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
 (p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 782, __extension__ __PRETTY_FUNCTION__); }));
783		const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
784		const int unroll_count = graph_prep->unroll_count;
785		if (ch[i]) // Prefer the dup side of things.
786			p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
787		int unref_p_ref = p_ref;
788		while (graph_prep->tensor_blocks[unref_p_ref].ref)
789			unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
790		vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
		// Note: this buffer_ref shadows the one declared before the loop and refers to graph_prep's buffers.
791		const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
		// Offsets accumulate across levels.
792		offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
793		// If the buffer already exists, prefer that.
794		const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
795		if (ptr)
796		{
797			// If I have any remaining path that is not covered from 0, I cannot possibly
798			// have any pointer from buffer (that can only happen if it is not dup).
			// This consumes the outer loop index: after it, i is -1 unless we returned.
799			for (--i; i >= 0; i--)
800				if (ch[i] != 0)
801					return 0;
802			// Allocate a new tensor in the metadata array on top of the existing buffer pointer.
803			const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
804			ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
805			*tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
			// Presumably shifts tv's data pointer by the accumulated offset — confirm against ccv_nnc_tensor_data_add.
806			ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
807			return tv_pos;
808		}
		// No pointer at this level; continue with the block this buffer refers to in the previous prep.
809		p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
810	}
811	return 0;
812}
813 
// Recursively walks preps[idx], preps[idx + 1], ... until preps[idx] is graph_prep, recording in ch[level]
// which unrolled copy is being explored at each intermediate level (0 first, then 1..unroll_count).
// At the graph_prep level the data position is resolved with _ccv_nnc_tensor_multiview_find_pos and, when
// the block has a distinct dup or `preserve` is set, wrapped in multiview tensors allocated in tensor_metadata.
// At intermediate levels the positions obtained for each choice are composed into a K0N multiview.
// The resulting metadata position is written to *pos_ref.
// `assign_update` is only forwarded to the recursive calls (its one direct use is commented out below).
// Returns 0 on success, or -1 when no data position could be found at the graph_prep level.
814// Descend from root to the prep level, and compose multiview from there.
815static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
816{
817	assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 817, __extension__ __PRETTY_FUNCTION__); }));
818	int i;
819	const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
820	const int unroll_count = prep->unroll_count;
	// Base case: reached the prep that owns block_ref.
821	if (prep == graph_prep)
822	{
823		const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
824		if (!data_pos)
825			return -1;
826		// Based on ch, go all the way back to find the exact pointer to compose.
827		if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
828			prep->dup_tensor_block_ref &&
829			prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
830			prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
831		{
			// The block has a distinct duplicate: collect the original plus one position per unrolled copy.
832			int pos[unroll_count + 1];
833			pos[0] = data_pos;
834			for (i = 0; i < unroll_count; i++)
835				pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
836			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
837			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
838			ccv_nnc_tensor_t* data[unroll_count + 1];
839			for (i = 0; i < unroll_count + 1; i++)
840				data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
841			ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
			// Store encoded positions instead of pointers in the multiview's data slots
			// (presumably resolved later by _ccv_nnc_tensor_metadata_rewire — confirm at the call sites).
842			for (i = 0; i < unroll_count + 1; i++)
843				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
844			*pos_ref = mv_pos;
845		} else {
846			*pos_ref = data_pos;
847		}
848		if (preserve)
849		{
850			// If need to preserve, this need to be more complicated. At loop 0, I need to access the new assigned tv.
851			// at any other loops, it should be the same. Thus, for this case, I will create a mv tensor as following:
852			// mv of K11, thus, when loop is 0, it unwrap to mv->data[0], otherwise, unwrap to mv->data[1].
853			// mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
854			// arena allocated).
855			// mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed raw pointer directly or
856			// a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
857			// it to a K01 structure.
858			// Why we didn't wrap it directly as mv->data[0] pointing to a assigned tv pointer and the mv->data[1] pointing
859			// to the raw pointer (as ptr_ref) with K11? The reason is we don't know the assigned tv is pointing to one
860			// memory region, or is a managed by multi-view tensor, which could pointing to different memory regions.
861			int prev_mv_pos = *pos_ref;
			// NOTE(review): *pos_ref was just set to mv_pos or data_pos above; data_pos is non-zero here and
			// _ccv_nnc_tensor_metadata_pos_new returns (pos << 1) + 1, so it is unclear how this can be -1.
			// Confirm whether this wrapping branch is reachable as written.
862			if (prev_mv_pos == -1)
863			{
864				prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865				ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
866				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
867				ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
868					tv,
869				}, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
870				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
871			}
			// prev_mv is fetched only after the new slot is allocated, i.e. after the metadata array may have been resized.
872			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
873			ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
874			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
875			ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
876				CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
877				(ccv_nnc_tensor_t*)prev_mv,
878			}, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
			// Parent link and data slots are stored as encoded positions, not pointers.
879			prev_mv->p = (void*)(intptr_t)mv_pos;
880			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
881			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
882			*pos_ref = mv_pos;
883		}
884		return 0;
885	}
	// Intermediate level: first descend with the non-dup choice; this one is asserted to succeed.
886	ch[idx] = 0;
887	int pos[unroll_count + 1];
888	pos[0] = 0;
889	const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
890	assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 890, __extension__ __PRETTY_FUNCTION__); }));
	// Then try every unrolled copy. A failure is only tolerated for the first copy, in which case i stays 0.
891	for (i = 0; i < unroll_count; i++)
892	{
893		ch[idx] = i + 1;
894		pos[i + 1] = 0;
895		const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
896		if (dup_retval < 0)
897		{
898			assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 898, __extension__ __PRETTY_FUNCTION__); }));
899			break;
900		}
901	}
902	// If current prep has no dup.
903	if (i == 0)
904	{
905		*pos_ref = pos[0];
906		return 0;
907	}
908	ccv_nnc_tensor_t* data[unroll_count + 1];
909	// Compose to a new multiview.
910	for (i = 0; i < unroll_count + 1; i++)
911		{ assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
 (pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 911, __extension__ __PRETTY_FUNCTION__); })); }
	// Allocate the multiview slot before fetching any pointers into the metadata array.
912	const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
913	for (i = 0; i < unroll_count + 1; i++)
914		data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
915	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
916	ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
	// Child multiviews get their parent link set to the new multiview's encoded position.
917	for (i = 0; i < unroll_count + 1; i++)
918		if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
919			((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
	// Data slots are stored as encoded positions rather than pointers.
920	for (i = 0; i < unroll_count + 1; i++)
921		CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
922	*pos_ref = mv_pos;
923	return 0;
924}
925 
926static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
927{
928	int i;
929	int is_input = 0;
930	assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
 else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 930, __extension__ __PRETTY_FUNCTION__); }));
931	for (i = 0; i < node->input_size && !is_input; i++)
932		if (p_ref == node->inputs[i])
933			is_input = 1;
934	int is_output = 0;
935	for (i = 0; i < node->output_size && !is_output; i++)
936		if (p_ref == node->outputs[i])
937			is_output = 1;
938	// Prefer it is an output if it is both the input and the output.
939	if (is_output)
940		return 1;
941	if (is_input)
942		return -1;
943	return 0;
944}
945 
946static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
947{
948	// No need to check whether to preserve if this is not a while loop.
949	if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
950		return 0;
951	assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 951, __extension__ __PRETTY_FUNCTION__
); }));
952	// If it is unassigned, no need to preserve.
953	if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
 UNASSIGNED))
954		return 0;
955	const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
956	// If p is not input, no need to preserve at all.
957	if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
958		return 0;
959	const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
960	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 960, __extension__ __PRETTY_FUNCTION__); }));
961	assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
 graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
 __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 961, __extension__ __PRETTY_FUNCTION__
); }));
962	const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
963	// If the buffer is a truly read-only one, no need to preserve.
964	if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
 0xc) == READ_ONLY)
965		return 0;
966	/* This needs detailed explanation, what does preserve mean?
967	 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
968	 * also used outside of the while loop, we cannot reuse the memory region of x for
969	 * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
970	 * y uses the same memory region as x). The way to workaround this is by using a different
971	 * memory region for y = x + 1, but for the first iteration, having x pointing to the
972	 * original. During the allocation process, the way to identify whether x should preserve
973	 * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
974	 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
975	 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
976	 * it is the input tensor whenever that is possible. A tensor block can point to two parent
977	 * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
978	 * tensor whenever that is possible. */
979	if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
980		return 0;
981	// Otherwise, return 1 because we now need to preserve.
982	return 1;
983}
984 
985static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
986{
987	assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 987, __extension__ __PRETTY_FUNCTION__
); }));
988	// If it is unassigned, no need to preserve.
989	if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
 UNASSIGNED))
990		return 0;
991	// Only tape var need to force broadcast, otherwise we already share the same memory region.
992	if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
993		return 0;
994	const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
995	// If p is not output, no need to broadcast at all.
996	if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
997		return 0;
998	const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
999	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 999, __extension__ __PRETTY_FUNCTION__); }));
1000	assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
 graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
 __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1000, __extension__ __PRETTY_FUNCTION__
); }));
1001	const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
1002	// If the buffer is a truly read-only one, no need to broadcast.
1003	if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
 0xc) == READ_ONLY)
1004		return 0;
1005	// Otherwise, return 1 because we now need to force broadcast for this tape var.
1006	return 1;
1007}
1008 
1009static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
1010{
1011	assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1011, __extension__ __PRETTY_FUNCTION__); }));
1012	int i;
1013	for (i = 0; i < mv->kind + mv->repeat; i++)
1014		if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
1015			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = tensor;
1016		else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW))
1017			_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i], tensor);
1018}
1019 
1020static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1021{
1022	assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1022, __extension__ __PRETTY_FUNCTION__); }));
1023	int i;
1024	if (mv->sp)
1025		for (i = 0; i < mv->sp->rnum; i++)
1026		{
1027			ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)));
1028			if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1029			{
1030				const int pos = (int)(intptr_t)*tensor;
1031				*tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1032				assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
 "ccv_nnc_symbolic_graph_compile.c", 1032, __extension__ __PRETTY_FUNCTION__
); }));
1033				_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1034			}
1035		}
1036	for (i = 0; i < mv->kind + mv->repeat; i++)
1037	{
1038		if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
 : (mv)->_inline_data)[i]) & 1))
1039			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1040		if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
 : (mv)->_inline_data)[i]->alias_ref) & 1))
1041			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]->alias_ref);
1042		if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW))
1043			_ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1044	}
1045}
1046 
1047static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1048{
1049	// Go to the root of the graph.
1050	const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1051	int i;
1052	for (i = 1; prep->p; i++)
1053		prep = prep->p;
1054	// Root graph should have no dup tensor blocks.
1055	assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
 ({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1055, __extension__ __PRETTY_FUNCTION__); }));
1056	const int c = i;
1057	const ccv_nnc_symbolic_graph_prep_t* preps[c];
1058	prep = graph_prep;
1059	preps[c - 1] = prep;
1060	for (i = 0; prep->p; i++)
1061		preps[c - 2 - i] = prep = prep->p;
1062	int ch[c]; // Use dynamic allocation for array. This is an array to record our selections when recursive from top to bottom.
1063	memset(ch, 0, sizeof(int) * c);
1064	int pos = 0;
1065	_ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1066	assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
 (ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1066, __extension__ __PRETTY_FUNCTION__); })); // This shouldn't never be modified.
1067	assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
 > 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1067, __extension__ __PRETTY_FUNCTION__); }));
1068	return pos;
1069}
1070 
1071static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1072{
1073	const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1074	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1075	ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1076	ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1077		CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1078		tv,
1079	}, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1080	CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1081	CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = tensor;
1082	return mv_pos;
1083}
1084 
1085static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1086{
1087	ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1088	const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1089	if (!is_multiview)
1090		return pos;
1091	while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1092	{
1093		const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1094		tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1095	}
1096	const ccv_nnc_tensor_t tensor = *tensor_ptr;
1097	const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1098	ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1099	*new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1100	new_tensor->dataof = tensor.dataof;
1101	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1102	new_tensor->alias_ref = (uintptr_t)pos;
1103	ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1104	return new_pos;
1105}
1106 
1107static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1108{
1109	const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1110	// It referenced to is not an alias.
1111	assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__
 ({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1111, __extension__ __PRETTY_FUNCTION__
); }));
1112	const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1113	const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1114	assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1114, __extension__ __PRETTY_FUNCTION__
); }));
1115	// Will use that to determine whether insert reference or not.
1116	const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1117	while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1118	{
1119		const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1120		alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1121	}
1122	const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1123	// If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1124	int pos;
1125	if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1126		ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1127	{
1128		pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1129		ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1130		*tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1131		tensor->dataof = alias_tensor.dataof;
1132	} else {
1133		pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1134		ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1135		// Otherwise initialize a tensor view
1136		*tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1137		tensor_view->alias_ref = (uintptr_t)alias_pos;
1138	}
1139	vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1140	if (is_multiview)
1141	{
1142		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1143		ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1144	}
1145}
1146 
1147static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1148{
1149	// If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1150	if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1151	{
1152		const int ref = tensor_blocks[block_ref].alias_ref - 1;
1153		if (!vt_tensors[ref])
1154			_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1155		vt_tensors[block_ref] = vt_tensors[ref];
1156		return;
1157	}
1158	assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1158, __extension__ __PRETTY_FUNCTION__
); }));
1159	const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1160	// If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1161	if (!vt_tensors[alias_ref])
1162		_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1163	_ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1164}
1165 
// Support for turning a linear pointer into an object storage (such as MTLBuffer).
#ifdef HAVE_MPS
/* Disposer callback: releases ptr through mpobjfree. userdata is ignored. */
static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
{
	(void)userdata;
	mpobjfree(0, ptr);
}
#endif
1173 
/* Pairs an object handle with a byte size. */
typedef struct {
	size_t size; /* size associated with obj */
	void* obj;   /* opaque object handle */
} tensor_arena_obj_track_t;
1178 
/* Hash-map key identifying a region: base pointer, offset from it, and size. */
typedef struct {
	void* ptr;    /* base pointer */
	off_t offset; /* offset from ptr */
	size_t size;  /* size of the region */
} obj_ptr_key_t;
1184 
1185static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1186{
1187	return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1188}
1189 
1190static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1191{
1192	return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1193}
1194 
1195KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)typedef struct kh_obj_ptr_s { khint_t n_buckets, size, n_occupied
, upper_bound; khint32_t *flags; obj_ptr_key_t *keys; void* *
vals; } kh_obj_ptr_t; static inline __attribute__ ((__unused__
)) kh_obj_ptr_t *kh_init_obj_ptr(void) { return (kh_obj_ptr_t
*)calloc(1,sizeof(kh_obj_ptr_t)); } static inline __attribute__
 ((__unused__)) void kh_destroy_obj_ptr(kh_obj_ptr_t *h) { if
 (h) { free((void *)h->keys); free(h->flags); free((void
 *)h->vals); free(h); } } static inline __attribute__ ((__unused__
)) void kh_clear_obj_ptr(kh_obj_ptr_t *h) { if (h && h
->flags) { memset(h->flags, 0xaa, ((h->n_buckets) <
 16? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h->
size = h->n_occupied = 0; } } static inline __attribute__ (
(__unused__)) khint_t kh_get_obj_ptr(const kh_obj_ptr_t *h, obj_ptr_key_t
 key) { if (h->n_buckets) { khint_t k, i, last, mask, step
 = 0; mask = h->n_buckets - 1; k = _kh_obj_ptr_hash_func(key
); i = k & mask; last = i; while (!((h->flags[i>>
4]>>((i&0xfU)<<1))&2) && (((h->
flags[i>>4]>>((i&0xfU)<<1))&1) || !
_kh_obj_ptr_hash_equal(h->keys[i], key))) { i = (i + (++step
)) & mask; if (i == last) return h->n_buckets; } return
 ((h->flags[i>>4]>>((i&0xfU)<<1))&
3)? h->n_buckets : i; } else return 0; } static inline __attribute__
 ((__unused__)) int kh_resize_obj_ptr(kh_obj_ptr_t *h, khint_t
 new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (
--(new_n_buckets), (new_n_buckets)|=(new_n_buckets)>>1,
 (new_n_buckets)|=(new_n_buckets)>>2, (new_n_buckets)|=
(new_n_buckets)>>4, (new_n_buckets)|=(new_n_buckets)>>
8, (new_n_buckets)|=(new_n_buckets)>>16, ++(new_n_buckets
)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size
 >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0
; else { new_flags = (khint32_t*)malloc(((new_n_buckets) <
 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t)); if (
!new_flags) return -1; memset(new_flags, 0xaa, ((new_n_buckets
) < 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t))
; if (h->n_buckets < new_n_buckets) { obj_ptr_key_t *new_keys
 = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets *
 sizeof(obj_ptr_key_t)); if (!new_keys) { free(new_flags); return
 -1; } h->keys = new_keys; if (1) { void* *new_vals = (void
**)realloc((void *)h->vals,new_n_buckets * sizeof(void*));
 if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals
; } } } } if (j) { for (j = 0; j != h->n_buckets; ++j) { if
 (((h->flags[j>>4]>>((j&0xfU)<<1))&
3) == 0) { obj_ptr_key_t key = h->keys[j]; void* val; khint_t
 new_mask; new_mask = new_n_buckets - 1; if (1) val = h->vals
[j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<<
1)); while (1) { khint_t k, i, step = 0; k = _kh_obj_ptr_hash_func
(key); i = k & new_mask; while (!((new_flags[i>>4]>>
((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask
; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<<
1))); if (i < h->n_buckets && ((h->flags[i>>
4]>>((i&0xfU)<<1))&3) == 0) { { obj_ptr_key_t
 tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (1
) { void* tmp = h->vals[i]; h->vals[i] = val; val = tmp
; } (h->flags[i>>4]|=1ul<<((i&0xfU)<<
1)); } else { h->keys[i] = key; if (1) h->vals[i] = val
; break; } } } } if (h->n_buckets > new_n_buckets) { h->
keys = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets
 * sizeof(obj_ptr_key_t)); if (1) h->vals = (void**)realloc
((void *)h->vals,new_n_buckets * sizeof(void*)); } free(h->
flags); h->flags = new_flags; h->n_buckets = new_n_buckets
; h->n_occupied = h->size; h->upper_bound = (khint_t
)(h->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static
 inline __attribute__ ((__unused__)) khint_t kh_put_obj_ptr(kh_obj_ptr_t
 *h, obj_ptr_key_t key, int *ret) { khint_t x; if (h->n_occupied
 >= h->upper_bound) { if (h->n_buckets > (h->size
<<1)) { if (kh_resize_obj_ptr(h, h->n_buckets - 1) <
 0) { *ret = -1; return h->n_buckets; } } else if (kh_resize_obj_ptr
(h, h->n_buckets + 1) < 0) { *ret = -1; return h->n_buckets
; } } { khint_t k, i, site, last, mask = h->n_buckets - 1,
 step = 0; x = site = h->n_buckets; k = _kh_obj_ptr_hash_func
(key); i = k & mask; if (((h->flags[i>>4]>>
((i&0xfU)<<1))&2)) x = i; else { last = i; while
 (!((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && (((h->flags[i>>4]>>((i&0xfU)
<<1))&1) || !_kh_obj_ptr_hash_equal(h->keys[i], key
))) { if (((h->flags[i>>4]>>((i&0xfU)<<
1))&1)) site = i; i = (i + (++step)) & mask; if (i ==
 last) { x = site; break; } } if (x == h->n_buckets) { if (
((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && site != h->n_buckets) x = site; else x = i; }
 } } if (((h->flags[x>>4]>>((x&0xfU)<<
1))&2)) { h->keys[x] = key; (h->flags[x>>4]&=
~(3ul<<((x&0xfU)<<1))); ++h->size; ++h->
n_occupied; *ret = 1; } else if (((h->flags[x>>4]>>
((x&0xfU)<<1))&1)) { h->keys[x] = key; (h->
flags[x>>4]&=~(3ul<<((x&0xfU)<<1)))
; ++h->size; *ret = 2; } else *ret = 0; return x; } static
 inline __attribute__ ((__unused__)) void kh_del_obj_ptr(kh_obj_ptr_t
 *h, khint_t x) { if (x != h->n_buckets && !((h->
flags[x>>4]>>((x&0xfU)<<1))&3)) { (
h->flags[x>>4]|=1ul<<((x&0xfU)<<1));
 --h->size; } }
1196 
1197static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1198{
1199	if (params.dim[0] == 0)
1200		return 0;
1201#ifdef HAVE_MPS
1202	if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1203	{
1204		int ret;
1205		const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
 12] * ccv_nnc_tensor_count(params);
1206		const obj_ptr_key_t key = {
1207			.ptr = ptr,
1208			.offset = offset,
1209			.size = size,
1210		};
1211		khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
1212		if (ret != 0)
1213		{
1214			void* obj = mpobjcreate(ptr, offset, size);
1215			if (!tensor_arena->disposers)
1216				tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1217			ccv_nnc_arena_disposer_t disposer = {
1218				.ptr = obj,
1219				.userdata = 0,
1220				.dispose = _ccv_nnc_tensor_arena_obj_dispose
1221			};
1222			ccv_array_push(tensor_arena->disposers, &disposer);
1223			kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1224			return obj;
1225		} else
1226			return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1227	}
1228#endif
1229	return ptr + offset;
1230}
1231 
1232static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1233{
1234	// All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1235	// Each tensor have the designation in assigned array, and offset in allocated_offset.
1236	const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1237	ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1238	const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1239	const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1240	const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1241	const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1242	const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1243	const int unroll_count = graph_prep->unroll_count;
1244	int i, j;
1245	for (i = 0; i < tensor_symbol_info_size; i++)
1246		for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1247		{
1248			const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1249			if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1250				TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
);
1251		}
1252	ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1253	graph_prep->tensor_arena = tensor_arena;
1254	tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1255	tensor_arena->buffers = (void*)(tensor_arena + 1);
1256	tensor_arena->buffer_size = alloc_prep->buffer_size;
1257	tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1258	tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1259	tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1260	tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1261	tensor_arena->pb_vt_tensors = 0;
1262	tensor_arena->vt_alias_r_refs_p = 0;
1263	tensor_arena->vt_alias_r_refs = 0;
1264	tensor_arena->vt_sizes = 0;
1265	tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1266	tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, (sizeof(ccv_nnc_tensor_t) * tensor_symbol_info_size + 15) / 16, 0);
1267	tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1268	tensor_arena->allocator.context.free = allocator.context.free;
1269	tensor_arena->allocator.isa = allocator.isa;
1270	tensor_arena->disposers = 0;
1271	// Copy alias_ref info back to the tensor arena.
1272	for (i = 0; i < tensor_symbol_info_size; i++)
1273		tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1274	// Do the buffer copies.
1275	for (i = 0; i < alloc_prep->buffer_size; i++)
1276		tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1277			tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1278			tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1279	if (graph_prep->while_count_tensor)
1280	{
1281		// If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1282		int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1283		assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
 ({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1283, __extension__ __PRETTY_FUNCTION__
); })); // pos must be 0 position.
1284		ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1285		*tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1286	}
1287	assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
 && !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
 && p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1287, __extension__ __PRETTY_FUNCTION__
); }));
1288	if (p_arena && p_graph_prep)
1289	{
1290		// Don't need to allocate the actual buffer, just use the pointer from the above.
1291		PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("Buffer assignment for sub arena %p (parent %p)\n",
 tensor_arena, p_arena); fflush(stdout); } } while (0);
1292		for (i = 0; i < tensor_arena->buffer_size; i++)
1293		{
1294			const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1295			int unref_p_ref = p_ref;
1296			while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1297				unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1298			assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
 ({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1298, __extension__ __PRETTY_FUNCTION__
); }));
1299			const int p_unroll_count = p_graph_prep->unroll_count;
1300			if (p_graph_prep->dup_tensor_block_ref &&
1301				p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1302				p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1303			{
1304				// This condition means in the parent graph, we point to multiple tensor blocks for the same
1305				// buffer, therefore, we cannot have one single pointer assigned in this case.
1306				// Later we will handle this by generate ccv_tensor_multiview_t structure.
1307				tensor_arena->buffers[i].ptr = 0;
1308				PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0);
1309				continue;
1310			}
1311			// Otherwise, find the actual buffer pointer.
1312			const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1313			assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1313, __extension__ __PRETTY_FUNCTION__); }));
1314			const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1315			if (!p_arena->buffers[buffer_ref].ptr)
1316			{
1317				// Pass it down as 0 ptr.
1318				tensor_arena->buffers[i].ptr = 0;
1319				PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0);
1320				continue;
1321			}
1322			const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1323			tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1324			PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
 (0);
1325		}
1326	} else {
1327		// Now, allocate actual buffers.
1328		PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0);
1329		for (i = 0; i < tensor_arena->buffer_size; i++)
1330		{
1331			const int buffer_type = tensor_arena->buffers[i].type;
1332			const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1333			if (tensor_arena->buffers[i].size == 0)
1334			{
1335				tensor_arena->buffers[i].ptr = 0;
1336				PRINT(CCV_CLI_VERBOSE, "|-Skip buffer %d with size 0\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Skip buffer %d with size 0\n", i); fflush(stdout
); } } while (0);
1337				continue;
1338			}
1339#ifdef HAVE_CUDA1
1340			if (memory_type == CCV_TENSOR_GPU_MEMORY)
1341			{
1342				const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1343				if (allocator.isa && allocator.isa->alloc)
1344					tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1345				else
1346					tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1347				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1348			} else {
1349				assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1349, __extension__ __PRETTY_FUNCTION__
); }));
1350				if (tensor_arena->buffers[i].pin_mem)
1351					tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1352				else
1353					ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1354				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1355			}
1356#elif defined(HAVE_MPS)
1357			if (memory_type == CCV_TENSOR_GPU_MEMORY)
1358			{
1359				const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1360				// if (allocator.isa && allocator.isa->alloc)
1361				// 	tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1362				// else
1363				tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1364				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1365			} else {
1366				assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1366, __extension__ __PRETTY_FUNCTION__
); }));
1367				ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1368				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1369			}
1370#else
1371			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1371, __extension__ __PRETTY_FUNCTION__
); }));
1372			ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1373			PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1374#endif
1375			assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
 ({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
 ("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1375, __extension__ __PRETTY_FUNCTION__); }));
1376		}
1377	}
1378	// Go over sub_preps and allocate arenas for them. Do it this early because
1379	// we may reference tensors from sub arenas, the reason why we need to reference
1380	// tensors from sub arenas is because for output tensors, sub arena's tensor
1381	// will have automatic reference updates.
1382	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1383		if (graph_prep->sub_preps[i])
1384			tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1385		else
1386			tensor_arena->sub_arenas[i] = 0;
1387	memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1388	// Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1389	ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1390#ifdef HAVE_MPS
1391	khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1392#else
1393	khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1394#endif
1395	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1396		if (tensor_arena->sub_arenas[i])
1397		{
1398			assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
 ({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1398, __extension__ __PRETTY_FUNCTION__
); }));
1399			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1400			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1401			if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1402				for (j = 0; j < node->output_size; j++)
1403				{
1404					const int idx = node->outputs[j];
1405					const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1;
1406					assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
 (s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1406, __extension__ __PRETTY_FUNCTION__); }));
1407					ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1408					assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
 ({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
 ("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1408, __extension__ __PRETTY_FUNCTION__); }));
1409					ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1410					// Only assign if it is a multiview tensor.
1411					if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1412						(sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1413						sub_arena_out_tensors[idx] = sub_tensor;
1414				}
1415		}
1416	// Assigning out the tensors (in case of sharing tensors / in-place ops).
1417	for (i = 0; i < tensor_symbol_info_size; i++)
1418		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
1419		{
1420			const int vt_ref = alloc_prep->vt_blocks[i];
1421			const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1422			// Either we have dup_tensor_block_ref in current layer, or we have that in
1423			// previous layer, therefore, cannot really find the buffer ptr.
1424			if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1425				((graph_prep->dup_tensor_block_ref &&
1426				  graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1427				  graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1428				 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1429			{
1430				assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1430, __extension__ __PRETTY_FUNCTION__
); })); // This must be in a sub-graph.
1431				// If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1432				if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1433					continue;
1434				const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1435				tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1436				ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1437			} else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1438				// When we want to allocate, we don't really need to if it needs force broadcast, because we will handle that later.
1439				const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1440				// If already created, use the same tensor, and continue.
1441				// Having ptr.
1442				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1443				ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1444				// Also, set its allocations.
1445				// Since tensor view is bit compatible with tensor, we can just cast.
1446				void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1447				*tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1448				assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1448, __extension__ __PRETTY_FUNCTION__
); }));
1449				// If we need to force broadcast, we need to wrap it in a multiview.
1450				if (graph_prep->tensor_blocks[i].p_refs[0] &&
1451					_ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1452				{
1453					const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1454					ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1455					ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1456					ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1457						tv,
1458					}, 0, 1, graph_prep->graph, mv);
1459					CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1460					pos = mv_pos;
1461					ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1462				}
1463				tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1464			}
1465		}
1466#ifdef HAVE_MPS
1467	kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1468#endif
1469	// Handle binded tensors. First handle cases without aliases.
1470	for (i = 0; i < tensor_bind_size; i++)
1471	{
1472		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1472, __extension__ __PRETTY_FUNCTION__
); }));
1473		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1474		if (resolved_symbol.d >= 0)
1475		{
1476			int d = resolved_symbol.d;
1477			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1478				continue;
1479			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1480			// It has nothing to do with alias.
1481			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1482				d = tensor_blocks[d].ref - 1;
1483			// For binded tensors, it shouldn't be assigned yet.
1484			// If it is assigned, the pointer should match the ones from the binded tensor.
1485			// This can only happen if an enforced in-place tensor is binded twice. If that
1486			// happens, we need to make sure it is binded to the same location.
1487			assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1487, __extension__ __PRETTY_FUNCTION__
); }));
1488			// See above assertion.
1489			if (tensor_arena->vt_tensors[d])
1490				continue;
1491			if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1492			{
1493				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1494				ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1495				ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1496				if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1497					for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1498						{ assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
 ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1498, __extension__ __PRETTY_FUNCTION__
); })); }
1499				// It is OK to be just as a whole smaller or equal to the binded one.
1500				assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1500, __extension__ __PRETTY_FUNCTION__
); }));
1501				memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1502				memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1503				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1504			} else {
1505				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1506				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1507				*tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1508				tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1509				tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1510				tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1511				tv->dataof = tensor_binds[i].tensor->dataof;
1512				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1513			}
1514		}
1515	}
1516	// Handle binded tensors. We handle alias here so it can reference to binded tensors.
1517	for (i = 0; i < tensor_bind_size; i++)
1518	{
1519		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1519, __extension__ __PRETTY_FUNCTION__
); }));
1520		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1521		if (resolved_symbol.d >= 0)
1522		{
1523			int d = resolved_symbol.d;
1524			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1525				d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1526			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1527			// It has nothing to do with alias.
1528			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1529				d = tensor_blocks[d].ref - 1;
1530			if (tensor_arena->vt_tensors[d])
1531				continue;
1532			// Assert original alias has no ofs. Otherwise our binding will be problematic.
1533			for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1534				{ assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
 == 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1534, __extension__ __PRETTY_FUNCTION__
); })); }
1535			if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1536			{
1537				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1538				ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1539				ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1540				if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1541					for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1542						{ assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
 ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1542, __extension__ __PRETTY_FUNCTION__
); })); }
1543				// It is OK to be just as a whole smaller or equal to the binded one.
1544				assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1544, __extension__ __PRETTY_FUNCTION__
); }));
1545				memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1546				memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1547				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1548			} else {
1549				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1550				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1551				*tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1552				tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1553				tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1554				tv->data = tensor_binds[i].tensor->data;
1555				tv->dataof = tensor_binds[i].tensor->dataof;
1556				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1557			}
1558		}
1559	}
1560	// Assign out refs. Refs are the simple case, so we handle them first (because they point to exactly the same metadata and the same region).
1561	// Skip refs that are actually aliases.
1562	for (i = 0; i < tensor_symbol_info_size; i++)
1563		// It could be binded tensor (or unused), in that case, it doesn't have a ref.
1564		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1565		{
1566			int ref = tensor_blocks[i].ref - 1;
1567			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1568				ref = tensor_blocks[ref].ref - 1;
1569			assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
 ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
 ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1569, __extension__ __PRETTY_FUNCTION__); }));
1570			tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1571		}
1572	// Now that refs are assigned out, handle the case where tensors need to be preserved because this is a sub-graph of a while loop.
1573	if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1574	{
1575		assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1575, __extension__ __PRETTY_FUNCTION__
); }));
1576		const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1577		const int p_idx = graph_prep->p_idx - 1;
1578		for (i = 0; i < node->input_size; i++)
1579		{
1580			const int idx = node->inputs[i];
1581			int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx))) - 1;
1582			assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
 ({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1582, __extension__ __PRETTY_FUNCTION__); }));
1583			const int vt_ref = alloc_prep->vt_blocks[block_ref];
1584			if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1585				continue;
1586			assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1586, __extension__ __PRETTY_FUNCTION__); }));
1587			const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1588			assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
 == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1588, __extension__ __PRETTY_FUNCTION__); }));
1589			assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
 == ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1589, __extension__ __PRETTY_FUNCTION__
); }));
1590			// Either we have dup_tensor_block_ref in current layer, or we have that in
1591			// previous layer, therefore, cannot really find the buffer ptr.
1592			if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1593				((graph_prep->dup_tensor_block_ref &&
1594				  graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1595				  graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1596				 !tensor_arena->buffers[buffer_ref].ptr))
1597			{
1598				// We haven't allocated anything for this yet.
1599				assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
 ? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1599, __extension__ __PRETTY_FUNCTION__
); }));
1600				const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1601				tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1602				ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1603			} else {
1604				const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1605				tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1606				ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1607			}
1608		}
1609	}
1610	// For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1611	// This creates the multi-view tensor to achieve that.
1612	for (i = 0; i < tensor_symbol_info_size; i++)
1613		if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1614		{
1615			const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1616			// Create phi multi-view.
1617			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1618			const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1619			const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1620			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1621			ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1622			ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1623			ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1624				intv,
1625				outv,
1626			}, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1627			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1628			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1629			tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1630			ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1631		}
1632	// Now it is time to handle alias.
1633	for (i = 0; i < alloc_prep->block_size; i++)
1634		if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1635		{
1636			const int block_ref = alloc_prep->blocks[i].block_ref;
1637			if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1638			{
1639				// Assigning out the tensor aliases.
1640				assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1640, __extension__ __PRETTY_FUNCTION__
); }));
1641				_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1642			}
1643		}
1644	// Now assigning out the rest of alias refs.
1645	for (i = 0; i < tensor_symbol_info_size; i++)
1646		// It could be binded tensor (or unused), in that case, it doesn't have a ref.
1647		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1648		{
1649			int ref = tensor_blocks[i].alias_ref - 1;
1650			assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
 ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
 ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1650, __extension__ __PRETTY_FUNCTION__); }));
1651			tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1652		}
1653	// Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1654	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1655		if (tensor_arena->sub_arenas[i])
1656		{
1657			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1658			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1659			for (j = 0; j < node->input_size; j++)
1660			{
1661				const int idx = node->inputs[j];
1662				const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1 : -1;
1663				if (s_idx < 0)
1664					continue;
1665				ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1666				// Only do the replacement if it is a multi-view tensor.
1667				// sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1668				if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1669				{
1670					// It cannot be binded tensor.
1671					assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1671, __extension__ __PRETTY_FUNCTION__
); }));
1672					const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1673					const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1674					ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1675					// If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1676					// to this tensor.
1677					if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1678					{
1679						const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1680						ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1681						ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1682						ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1683						ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1684						ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
 : (multiview)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)[0]);
1685						while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1686							tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
 ? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)[0]);
1687						*ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1688						ref_tensor->data = tv->data;
1689						ref_tensor->dataof = tv->dataof;
1690						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1691					} else
1692						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1693				}
1694			}
1695		}
1696	// After alias created, for case..of statement, we now revert back to flat tensor rather than multi-view.
1697	// No worries though, this new tensor is subscribed for the phi multi-view. More over, we have logic
1698	// when initialize case..of node, which will take the phi multi-view again.
1699	for (i = 0; i < tensor_symbol_info_size; i++)
1700		if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1701		{
1702			assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
 & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1702, __extension__ __PRETTY_FUNCTION__
); }));
1703			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1704			assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
 ({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1704, __extension__ __PRETTY_FUNCTION__); }));
1705			tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1706		}
1707	// rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1708	for (i = 0; i < tensor_symbol_info_size; i++)
1709		if (tensor_arena->vt_tensors[i])
1710			tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1711	// Associate multiview tensors from sub arena to the parent.
1712	if (sub_arena_out_tensors)
1713	{
1714		for (i = 0; i < alloc_prep->block_size; i++)
1715			if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1716			{
1717				const int block_ref = alloc_prep->blocks[i].block_ref;
1718				if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1719					continue;
1720				int sub_arena_ref = block_ref;
1721				if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1722				{
1723					// Assigning out the tensor aliases.
1724					assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1724, __extension__ __PRETTY_FUNCTION__
); }));
1725					const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1726					// It referenced to is not an alias.
1727					assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
 0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1727, __extension__ __PRETTY_FUNCTION__
); }));
1728					sub_arena_ref = alias_ref;
1729					if (!sub_arena_out_tensors[sub_arena_ref])
1730						continue;
1731				}
1732				if (!sub_arena_out_tensors[sub_arena_ref])
1733					continue;
1734				ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
) ? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1735				assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1735, __extension__ __PRETTY_FUNCTION__); }));
1736				// This is only possible if the vt_tensors is a phi node.
1737				if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1738				{
1739					// For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1740					ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1741					assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
 ({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
 ("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1741, __extension__ __PRETTY_FUNCTION__); }));
1742					assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
 ? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1742, __extension__ __PRETTY_FUNCTION__
); }));
1743					CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)[1]->alias_ref = (uintptr_t)mv;
1744					ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)[1]);
1745				} else {
1746					tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1747					ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1748				}
1749			}
1750	}
1751	// Go over all the tensors that has assign_ref. If the tensor it is assigned from is:
1752	// 1). From sub_arena_out_tensors, it could be possible that it now pointing to an area this arena doesn't know.
1753	// 2). From phi multi-view, for this case, it is in fact that this arena won't know which memory I am going to use prior.
1754	// Therefore, for above two scenarios, the tensor has assign_ref, even it is a multiview tensor, need to subscribe
1755	// to the output of assign_ref tensor.
1756	for (i = 0; i < tensor_symbol_info_size; i++)
1757		if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1758		{
1759			const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1760			ccv_nnc_tensor_t* assign_tensor;
1761			if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1762				assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1763			else
1764				assign_tensor = tensor_arena->vt_tensors[assign_ref];
1765			ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1766		}
1767	// After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1768	for (i = 0; i < tensor_bind_size; i++)
1769	{
1770		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1770, __extension__ __PRETTY_FUNCTION__
); }));
1771		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1772		if (resolved_symbol.d >= 0)
1773		{
1774			int d = resolved_symbol.d;
1775			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1776			// It has nothing to do with alias.
1777			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1778				d = tensor_blocks[d].ref - 1;
1779			// Note we don't trace back on alias. This is intentional.
1780			assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
 tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1780, __extension__ __PRETTY_FUNCTION__
); }));
1781		}
1782	}
1783	if (sub_arena_out_tensors)
1784		ccfreefree(sub_arena_out_tensors);
1785	// Rewire sub arena's tensor references.
1786	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1787		if (tensor_arena->sub_arenas[i])
1788		{
1789			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1790			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1791			for (j = 0; j < node->input_size; j++)
1792			{
1793				const int idx = node->inputs[j];
1794				const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1 : -1;
1795				if (s_idx < 0)
1796					continue;
1797				ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1798				// Only do the replacement if it is a multi-view tensor.
1799				// sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1800				if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1801				{
1802					// This is binded tensor, bind it now.
1803					if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1804						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1805					else
1806						_ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1807				}
1808			}
1809		}
1810	return tensor_arena;
1811}
1812 
1813static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1814{
1815	assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
 ; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1815, __extension__ __PRETTY_FUNCTION__); }));
1816	if ((intptr_t)graph == tensor_arena->graph_ref)
1817	{
1818		assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
 0 && pair_ref < tensor_arena->vt_tensor_size) ;
 else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1818, __extension__ __PRETTY_FUNCTION__
); }));
1819		return tensor_arena->vt_tensors[pair_ref];
1820	}
1821	int i;
1822	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1823		if (tensor_arena->sub_arenas[i])
1824		{
1825			ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1826			if (tensor)
1827				return tensor;
1828		}
1829	return 0;
1830}
1831 
1832static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1833{
1834	if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1835		tensor->type |= CCV_TAPE_ALLOC;
1836	else {
1837		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1838		mv->type |= CCV_TAPE_ALLOC;
1839		int i;
1840		for (i = 0; i < mv->repeat + mv->kind; i++)
1841			_ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1842	}
1843}
1844 
1845static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1846{
1847	assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
 __assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1847, __extension__ __PRETTY_FUNCTION__
); }));
1848	int i;
1849	for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1850	{
1851		if (graph_prep->tensor_symbol_info[i].pair_ref)
1852		{
1853			tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1854			// No need to continue check this if it is from its pair.
1855			continue;
1856		}
1857		if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1858		{
1859			// If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1860			if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
))
1861			{
1862				const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1863				if (vt_ref >= 0 &&
1864					TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc) == READ_ONLY)
1865					continue;
1866			}
1867			_ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1868		}
1869	}
1870	for (i = 0; i < graph_prep->sub_prep_size; i++)
1871		if (graph_prep->sub_preps[i])
1872			_ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1873}
1874 
1875static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1876{
1877	int i, found = 0;
1878	// Try to insert head.
1879	ccv_array_t* head = tensor_blocks.head;
1880	assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
 else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1880, __extension__ __PRETTY_FUNCTION__); }));
1881	for (i = 0; i < head->rnum;)
1882	{
1883		const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i)));
1884		if (head_idx == idx)
1885		{
1886			found = 1;
1887			break;
1888		}
1889		ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1890		if (cell.i32 && cell.i32[0] > 0)
1891		{
1892			/* If the current node is the parent of the head node, check if we found it or not. */
1893			/* If not found, replace the current one. */
1894			if (!found)
1895			{
1896				found = 1;
1897				*(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i))) = idx;
1898			} else {
1899				/* Remove the current one, change the rnum. */
1900				if (i < head->rnum - 1)
1901					*(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i))) = *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(head->rnum - 1)));
1902				--head->rnum;
1903				continue;
1904			}
1905		} else {
1906			// If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1907			cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1908			if (cell.i32 && cell.i32[0] > 0)
1909			{
1910				found = 1;
1911				break;
1912			}
1913		}
1914		/* Advancing i. */
1915		++i;
1916	}
1917	/* If not found, push this idx to the end of the array. */
1918	if (!found)
1919		ccv_array_push(head, &idx);
1920	// Try to insert tail.
1921	found = 0;
1922	ccv_array_t* tail = tensor_blocks.tail;
1923	assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
 else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1923, __extension__ __PRETTY_FUNCTION__); }));
1924	for (i = 0; i < tail->rnum;)
1925	{
1926		const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i)));
1927		if (tail_idx == idx)
1928		{
1929			found = 1;
1930			break;
1931		}
1932		ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1933		if (cell.i32 && cell.i32[0] > 0)
1934		{
1935			/* If the current node is the child of the tail node, check if we found it or not. */
1936			/* If not found, replace the current one. */
1937			if (!found)
1938			{
1939				found = 1;
1940				*(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i))) = idx;
1941			} else {
1942				/* Remove the current one, change the rnum. */
1943				*(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i))) = *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(tail->rnum - 1)));
1944				--tail->rnum;
1945				continue;
1946			}
1947		} else {
1948			// If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1949			cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1950			if (cell.i32 && cell.i32[0] > 0)
1951			{
1952				found = 1;
1953				break;
1954			}
1955		}
1956		/* Advancing i. */
1957		++i;
1958	}
1959	/* If not found, push this idx to the end of the array. */
1960	if (!found)
1961		ccv_array_push(tail, &idx);
1962}
1963 
1964ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1965{
1966	if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1967	{
1968		assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
 0 && symbol.d < tensor_arena->vt_tensor_size) ;
 else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1968, __extension__ __PRETTY_FUNCTION__
); }));
1969		ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1970		if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1971		{
1972			ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1973			while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1974				mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1975			return (ccv_nnc_tensor_t*)mv;
1976		}
1977		return tensor;
1978	}
1979	int i;
1980	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1981		if (tensor_arena->sub_arenas[i])
1982		{
1983			ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1984			if (tensor)
1985				return tensor;
1986		}
1987	return 0;
1988}
1989 
1990ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1991{
1992	if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1993	{
1994		assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
 >= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1994, __extension__ __PRETTY_FUNCTION__
); }));
1995		return graph_exec_arena->graph_execs[symbol.d];
1996	}
1997	int i;
1998	for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1999		if (graph_exec_arena->sub_arenas[i])
2000		{
2001			ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
2002			if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
2003				return exec;
2004		}
2005	return (ccv_nnc_graph_exec_t){}; // 0.
2006}
2007 
2008ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2009{
2010	return graph_exec_arena->source;
2011}
2012 
2013ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2014{
2015	return graph_exec_arena->destination;
2016}
2017 
2018// Check whether the head is the beginning of this block.
2019static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
2020{
2021	assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
 ({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 2021, __extension__ __PRETTY_FUNCTION__
); }));
2022	return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0))) == head_node);
2023}
2024 
2025// Check whether the tail is the end of this block.
2026static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2027{
2028	assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
 ({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2028, __extension__ __PRETTY_FUNCTION__
); }));
2029	return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0))) == tail_node);
2030}
2031 
2032// Make two tensor blocks one. Return 1 if that happened.
2033static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2034{
2035	// Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2036	if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2037		(!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2038		tensor_blocks[p_ref_0].tail->rnum == 1 &&
2039		tensor_blocks[p_ref_1].head->rnum == 1 &&
2040		tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2041		*(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
))
2042	{
2043		// If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2044		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
 ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
 == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
 ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2044, __extension__ __PRETTY_FUNCTION__); }));
2045		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
 ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
 == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
 ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2045, __extension__ __PRETTY_FUNCTION__); }));
2046		ccv_array_free(tensor_blocks[p_ref_0].tail);
2047		tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2048		if (tensor_blocks[p_ref_1].p_refs[0])
2049		{
2050			assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
 0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
 0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2050, __extension__ __PRETTY_FUNCTION__
); })); // It simply cannot have more than one p_refs, otherwise we cannot merge.
2051			if (!tensor_blocks[p_ref_0].p_refs[0])
2052				tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2053			else
2054				tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2055		}
2056		tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2057		TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
 & ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)));
2058		ccv_array_free(tensor_blocks[p_ref_1].head);
2059		if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2060			TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
 | UNFOLDABLE_AS_INPUT));
2061		// Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2062		TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
 & ~0x3) | UNASSIGNED));
2063		tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2064		if (!tensor_blocks[p_ref_0].r_refs)
2065			tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2066		ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2067		tensor_blocks[p_ref_1].size = 0;
2068		tensor_blocks[p_ref_1].head = 0;
2069		tensor_blocks[p_ref_1].tail = 0;
2070		return 1;
2071	}
2072	return 0;
2073}
2074 
2075static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2076{
2077	int i, j, k;
2078	// Generate exec dependencies (or, in other words, partial ordering of executions).
2079	ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2080	int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2081	int buf_size;
2082	if (p_node_info)
2083		{ assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
 if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2083, __extension__ __PRETTY_FUNCTION__
); })); }
2084#define for_block(x, val) \
2085	do { \
2086		if (((int32_t*)val)[0] > 0) \
2087		{ \
2088			buf[buf_size * 2] = x; \
2089			buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2090			++buf_size; \
2091		} \
2092	} while (0)
2093	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx; {
2094		buf_size = 0; /* save all its parent deps to this buffer */
2095		ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2096		if (vector)
2097			CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
2098		if (!node->outgoings)
2099			continue;
2100		for (i = 0; i < node->outgoings->rnum; i++)
2101		{
2102			int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
2103			const int32_t one = 1;
2104			ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2105			/* If not found, set it. If the current node is the destination node, there is no need
2106			 * to set itself as parent of subsequent nodes because of its terminal nature. */
2107			if (!cell.i32 || cell.i32[0] == 0)
2108				ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2109			if (buf_size > 0)
2110			{
2111				ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2112				assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2112, __extension__ __PRETTY_FUNCTION__); }));
2113				for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2114				{
2115					ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2116					/* If not found, set */
2117					if (!cell.i32 || cell.i32[0] == 0)
2118						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2119					else {
2120						/* Otherwise, set to the longest one */
2121						int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
 + 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; });
2122						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2123					}
2124				}
2125			}
2126		}
2127	} ccv_nnc_graph_visit_endfor} }
2128#undef for_block
2129	ccfreefree(buf);
2130	// This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2131	const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2132	ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2133	// The reason is that I need to mark every one of them as unassigned unless it is used somewhere. It
2134	// happens that I have to loop through all relevant nodes to find out if one is used or not.
2135	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2136		tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2137	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2138		for (i = 0; i < node->input_size; i++)
2139			if (node->inputs[i] >= 0)
2140			{
2141				tensor_blocks[node->inputs[i]].flags = 0;
2142				// If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2143				// This will get propagated back to the buffer, and used there to determine the allocation function to use.
2144				if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2145					(node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2146					tensor_blocks[node->inputs[i]].pin_mem = 1;
2147			}
2148		for (i = 0; i < node->output_size; i++)
2149			if (node->outputs[i] >= 0)
2150			{
2151				tensor_blocks[node->outputs[i]].flags = 0;
2152				// If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2153				// This will get propagated back to the buffer, and used there to determine the allocation function to use.
2154				if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2155					(node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2156					tensor_blocks[node->outputs[i]].pin_mem = 1;
2157			}
2158	} ccv_nnc_graph_visit_endfor} }
2159	if (p_node_info)
2160	{
2161		assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
 ({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2161, __extension__ __PRETTY_FUNCTION__
); }));
2162		// Mark it as used if it is used in either input or output.
2163		for (i = 0; i < p_node_info->input_size; i++)
2164			if (p_node_info->inputs[i] >= 0)
2165			{
2166				const int d = p_node_info->inputs[i];
2167				if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2168				{
2169					const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1))) - 1;
2170					if (dd >= 0) // If this exists in this sub-graph, great.
2171						tensor_blocks[dd].flags = 0;
2172				}
2173			}
2174		for (i = 0; i < p_node_info->output_size; i++)
2175			if (p_node_info->outputs[i] >= 0)
2176			{
2177				const int d = p_node_info->outputs[i];
2178				if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2179				{
2180					const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1))) - 1;
2181					if (dd >= 0) // If this exists in this sub-graph, great.
2182						tensor_blocks[dd].flags = 0;
2183				}
2184			}
2185	}
2186	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2187		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2188		{
2189			// Check no tensor info is auto now.
2190			assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2190, __extension__ __PRETTY_FUNCTION__
); }));
2191			// If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
2192			// therefore, its own life-cycle almost certainly won't concatenate properly with the tensor to
2193			// fold to).
2194			if (tensor_symbol_info[i].assign_ref)
2195			{
2196				// TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2197				// It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2198				// it kept its own representation, which is not the case for output).
2199				TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
));
2200				const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2201				// But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2202				TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT));
2203				// It also cannot be folded as output (except i), because we need to keep its own representation.
2204				TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT));
2205				assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
 == 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2205, __extension__ __PRETTY_FUNCTION__
); }));
2206				tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2207				for (j = 0; j < unroll_count; j++)
2208				{
2209					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
 = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT));
2210					TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
 = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT));
2211				}
2212				if (tensor_blocks[assign_ref].bypass_ref)
2213				{
2214					// If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2215					tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2216					const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2217					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT));
2218					TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT));
2219					// On the other hand, it can be folded into the except_ref for the bypass_ref.
2220					tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2221					if (dup_tensor_from_ref)
2222					{
2223						const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2224						if (bypass_from_ref >= 0)
2225						{
2226							TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT));
2227							TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT));
2228							assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
 + unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
 - 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2228, __extension__ __PRETTY_FUNCTION__
); }));
2229							for (j = 0; j < unroll_count - 1; j++)
2230							{
2231								// Mark every incarnation as unfold-able.
2232								TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
 + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
 * unroll_count + j]].flags | UNFOLDABLE_AS_INPUT));
2233								TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
 + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
 * unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT));
2234							}
2235						}
2236					}
2237				}
2238			}
2239		}
2240	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2241	{
2242		// If it has a pair reference, we don't need to allocate this tensor at all,
2243		// set it to be unassigned.
2244		if (tensor_symbol_info[i].pair_ref)
2245			TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED));
2246		// If it is a tape variable, set it to be un-foldable as well (otherwise we cannot use tape properly).
2247		else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2248			TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2249			TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
));
2250			// For this case, there is no exception.
2251			tensor_blocks[i].unfoldable_except_ref = 0;
2252		} else if (tensor_symbol_info[i].p_ref) {
2253			assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2253, __extension__ __PRETTY_FUNCTION__); }));
2254			const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2255			// If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2256			if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2257				// TODO: This check can be lifted if we can fold in the parent graph.
2258				if (-1 == p_ref_is_in_or_out)
2259					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2260			if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2261				TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2262		}
2263	}
2264	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2265	{
2266		if (tensor_symbol_info[i].alias_ref)
2267		{
2268			const int ref = tensor_symbol_info[i].alias_ref - 1;
2269			// If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2270			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2271				tensor_blocks[ref].flags = 0;
2272			// An alias cannot ref to another alias.
2273			assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
 __assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2273, __extension__ __PRETTY_FUNCTION__); }));
2274			tensor_blocks[i].flags = ALIAS;
2275			tensor_blocks[i].ref = ref + 1; // Assign the ref.
2276			if (!tensor_blocks[ref].r_refs)
2277				tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2278			ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2279		}
2280	}
2281	// Scan again and if the ref is not assigned, mark the alias not assigned.
2282	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2283		if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2284		{
2285			const int ref = tensor_blocks[i].ref - 1;
2286			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2287			{
2288				// Mark this as unassigned.
2289				tensor_blocks[i].flags = UNASSIGNED;
2290				tensor_blocks[i].ref = 0;
2291			}
2292		}
2293	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2294	{
2295		// If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2296		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
2297		{
2298			tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2299			tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2300			// Cache tensor size (align to 16 bytes).
2301			tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2302		}
2303		// If there is a p_ref, add the one to the p_refs list.
2304		if (tensor_symbol_info[i].p_ref)
2305			tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2306	}
2307	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2308		for (i = 0; i < node->input_size; i++)
2309		{
2310			int d = node->inputs[i];
2311			if (d < 0)
2312				continue;
2313			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2314				d = tensor_symbol_info[d].alias_ref - 1;
2315			tensor_blocks[d].flags |= READ_ONLY;
2316			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2317				continue;
2318			assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2318, __extension__ __PRETTY_FUNCTION__
); }));
2319			/* If this is first encounter, its head starts (this tensor is init'ed outside of the graph
2320			 * from the very beginning of the graph life-cycle and ends here. */
2321			if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
)))
2322			{
2323				for (j = 0; j < source_size; j++)
2324				{
2325					// If the source is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2326					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2327					if (cell.i32 && cell.i32[0] > 0)
2328						_ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2329				}
2330				/* If this is a read-only (based on SSA, if first encountered as read), and this is
2331				 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2332				 * loop, however, in that case, you need to prevent read-only gets reused for the
2333				 * output tensor, which is not obvious how to implement correctly), and it is not
2334				 * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2335				 * of memory anyway (because on second loop, we want to read the same value out).
2336				 * Mark it to the end of the graph. */
2337				if (p_node_info && !tensor_symbol_info[d].assign_ref)
2338					for (j = 0; j < destination_size; j++)
2339					{
2340						// If the destination is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2341						const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2342						if (cell.i32 && cell.i32[0] > 0)
2343							_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2344					}
2345			}
2346			_ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2347		}
2348		for (i = 0; i < node->output_size; i++)
2349		{
2350			int d = node->outputs[i];
2351			if (d < 0)
2352				continue;
2353			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2354				d = tensor_symbol_info[d].alias_ref - 1;
2355			tensor_blocks[d].flags |= WRITE_ONLY;
2356			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2357				continue;
2358			assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2358, __extension__ __PRETTY_FUNCTION__
); }));
2359			_ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2360		}
2361	} ccv_nnc_graph_visit_endfor} }
2362	// For any assign_ref, its life-time kept until the end and wrap over.
2363	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2364		// If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2365		// that "somewhere else" need to keep its life-time til the end.
2366		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) &&
2367			p_node_info && tensor_symbol_info[i].assign_ref)
2368		{
2369			const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2370			for (j = 0; j < destination_size; j++)
2371			{
2372				// This logic is to be more conservative about which destination we add to.
2373				// As of now, if we add everything, it is fine most likely. However, it may
2374				// cause issues in the future to do so naively. Thus, instead, we only add
2375				// the destination to it iff either the tensor is not used at all, or, the
2376				// destination is on the same stream as of the tensor block some way.
2377				int flag = !tensor_blocks[assign_ref].tail;
2378				for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2379				{
2380					const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
 + (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)));
2381					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2382					flag = (cell.i32 && cell.i32[0] > 0);
2383				}
2384				if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2385					_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2386			}
2387		}
2388	for (i = 0; i < output_size; i++)
2389	{
2390		assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
 __assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2390, __extension__ __PRETTY_FUNCTION__); }));
2391		int d = outputs[i].d;
2392		if (d < 0)
2393			continue;
2394		if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2395			d = tensor_symbol_info[d].alias_ref - 1;
2396		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2397			continue;
2398		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2398, __extension__ __PRETTY_FUNCTION__
); }));
2399		for (j = 0; j < destination_size; j++)
2400		{
2401			int flag = !tensor_blocks[d].tail;
2402			for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2403			{
2404				const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)));
2405				const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2406				flag = (cell.i32 && cell.i32[0] > 0);
2407			}
2408			if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2409				_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2410		}
2411	}
2412	// Enforce tensor reuse by collapse tensors for in-place operations. We will fault if this cannot be done.
2413	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2414		int x, y;
2415		for (x = 0; x < node->input_size; x++)
2416			for (y = 0; y < node->output_size; y++)
2417				/* Some operations enforces some tensors to be the same for inputs / outputs. */
2418				if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2419				{
2420					// If both unassigned, it is fine.
2421					if (node->inputs[x] < 0 && node->outputs[y] < 0)
2422						continue;
2423					int ref = node->inputs[x];
2424					assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2424, __extension__ __PRETTY_FUNCTION__); }));
2425					while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
2426						ref = tensor_blocks[ref].ref - 1;
2427					const int node_output_y = node->outputs[y];
2428					assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
 ({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2428, __extension__ __PRETTY_FUNCTION__
); }));
2429					// If both are not computable, it is fine, we don't need to enforce.
2430					if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) &&
2431						!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
 !((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
)))
2432						continue;
2433					// Otherwise, enforce and error out if failed.
2434					if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2435						{ assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2435, __extension__ __PRETTY_FUNCTION__
); })); }
2436				}
2437	} ccv_nnc_graph_visit_endfor} }
2438	// Ignore tensors that are already bound, no matter if it is used or not. Doing it here because
2439	// we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2440	// that is not enforced in-place (because the tensor enforced in-place will be different than the
2441	// binding one).
2442	for (i = 0; i < tensor_bind_size; i++)
2443	{
2444		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2445		// If there is a tensor bound, then it is unassigned.
2446		if (resolved_symbol.d >= 0)
2447		{
2448			int d = resolved_symbol.d;
2449			// I cannot assert too much at this moment.
2450			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2451				d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2452			// This check is for in-place ops. Only in-place op could have unassigned but ref.
2453			// It has nothing to do with alias.
2454			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2455				d = tensor_blocks[d].ref - 1;
2456			// Doesn't work if this is a loop carrying variable.
2457			assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
 __extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
 __assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2457, __extension__ __PRETTY_FUNCTION__); }));
2458			tensor_blocks[d].flags = UNASSIGNED;
2459			tensor_blocks[d].ref = 0; // No need to have ref as well.
2460		}
2461	}
2462	// Maximum tensor reuse by collapse tensors allows in-place operations (and it matches the start, end tensor).
2463	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2464		int x, y;
2465		for (x = 0; x < node->input_size; x++)
2466		{
2467			/* If the input is not assigned, it can be referenced, find the referenced one */
2468			int ref = node->inputs[x];
2469			if (ref < 0)
2470				continue;
2471			const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2472			while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
2473				ref = tensor_blocks[ref].ref - 1;
2474			assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
 ({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2474, __extension__ __PRETTY_FUNCTION__
); }));
2475			if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) &&
2476				tensor_blocks[ref].tail->rnum == 1)
2477			{
2478				for (y = 0; y < node->output_size; y++)
2479					/* Only proceed if the input symbol is different from the output symbol, */
2480					/* and the input symbol meets the output symbol exactly at the same spot. */
2481					if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2482						node->outputs[y] >= 0 &&
2483						ref != node->outputs[y] &&
2484						TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
 0x3) == UNASSIGNED)))
2485					{
2486						const int node_output_y = node->outputs[y];
2487						const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2488						/* If dimension matches perfectly, then we can assign y_symbol to x.
2489						 * If both of them are aliases, making sure their origin matches in size too. */
2490						if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2491						{
2492							_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2493							// This refers to an alias itself, now mark it and will be processed later.
2494							if (ref != node->inputs[x])
2495								tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2496						}
2497					}
2498			}
2499		}
2500	} ccv_nnc_graph_visit_endfor} }
2501	// Specifically handle the bypass. This need to be done after the first pass.
2502	// I need to extend the bypass life-time to the same as the one I am going with.
2503	// It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2504	ccv_nnc_tensor_block_t empty_block = {};
2505	empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2506	empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2507	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2508		if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2509		{
2510			int can_bypass = 1;
2511			for (i = 0; can_bypass && i < node->output_size; i++)
2512			{
2513				int d = node->outputs[i];
2514				if (d < 0)
2515					continue;
2516				if (!tensor_blocks[d].bypass_ref)
2517					continue;
2518				while (tensor_blocks[d].ref)
2519					d = tensor_blocks[d].ref - 1;
2520				int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2521				while (tensor_blocks[bypass_ref].ref)
2522					bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2523				// If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2524				if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2525					continue;
2526				ccv_array_clear(empty_block.head);
2527				for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2528					ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
 + (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j))));
2529				ccv_array_clear(empty_block.tail);
2530				for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2531					ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
 + (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j))));
2532				for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2533					_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j))), empty_block);
2534				for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2535					_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j))), empty_block);
2536				// It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2537				assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
 tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
 ({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
 tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
 ("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2537, __extension__ __PRETTY_FUNCTION__
); }));
2538				int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2539				while (tensor_blocks[b_ref].ref)
2540					b_ref = tensor_blocks[b_ref].ref - 1;
2541				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2542				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2543				// These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2544				// even after we extend the life-time of bypass_ref. Then we are in a good shape.
2545				can_bypass = can_bypass && (a_hop_b || b_hop_a);
2546			}
2547			if (can_bypass)
2548			{
2549				for (i = 0; i < node->output_size; i++)
2550				{
2551					int d = node->outputs[i];
2552					if (d < 0)
2553						continue;
2554					if (!tensor_blocks[d].bypass_ref)
2555						continue;
2556					while (tensor_blocks[d].ref)
2557						d = tensor_blocks[d].ref - 1;
2558					int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2559					while (tensor_blocks[bypass_ref].ref)
2560						bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2561					// The bypass_ref can extend its life-time.
2562					for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2563						_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]);
2564					for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2565						_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]);
2566				}
2567			} else {
2568				for (i = 0; i < node->output_size; i++)
2569					tensor_blocks[node->outputs[i]].bypass_ref = 0;
2570				const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2571				// Mark this exec as no bypass IO (thus, I need to insert explicit data transfer.
2572				exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2573			}
2574		}
2575	} ccv_nnc_graph_visit_endfor} }
2576	ccv_array_free(empty_block.head);
2577	ccv_array_free(empty_block.tail);
2578	*r_exec_dep = exec_dep;
2579	*r_tensor_blocks = tensor_blocks;
2580}
2581 
2582static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2583{
2584	if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2585	{
2586		ccv_nnc_cmd_t retval = cmd;
2587		retval.cmd = CCV_NNC_NOOP;
2588		return retval;
2589	}
2590	return cmd;
2591}
2592 
2593static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2594{
2595	if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2596	{
2597		if (tensor_symbol_info[input].alias_ref)
2598		{
2599			const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2600			assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2600, __extension__ __PRETTY_FUNCTION__
); }));
2601			ccv_nnc_tensor_symbol_t tensor_symbol = {};
2602			if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2603			{
2604				tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2605				if (tensor_symbol_info[alias_ref].pair_ref)
2606					ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2607						.d = tensor_symbol_info[alias_ref].pair_ref - 1,
2608						.graph = dup_graph->pair
2609					});
2610				ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2611				dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2612			} else {
2613				tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2614				tensor_symbol.graph = dup_graph;
2615			}
2616			ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2617			if (tensor_symbol_info[input].pair_ref)
2618				ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2619					.d = tensor_symbol_info[input].pair_ref - 1,
2620					.graph = dup_graph->pair
2621				});
2622			ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2623			dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2624		} else {
2625			ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2626			if (tensor_symbol_info[input].pair_ref)
2627				ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2628					.d = tensor_symbol_info[input].pair_ref - 1,
2629					.graph = dup_graph->pair
2630				});
2631			ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2632			dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2633		}
2634		if (tensor_symbol_info[input].bypass_ref)
2635		{
2636			const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2637			assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
 ({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2637, __extension__ __PRETTY_FUNCTION__
); }));
2638			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])));
2639			symbol_info->bypass_ref = dup_bypass_ref + 1;
2640		}
2641	}
2642	return (ccv_nnc_tensor_symbol_t) {
2643		.d = dup_tensor_block_ref[input * unroll_count],
2644		.graph = dup_graph,
2645	};
2646}
2647 
2648static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2649{
2650	int i;
2651	if (dup_exec_ref[idx * unroll_count] < 0)
2652	{
2653		// Input has to come before output, because output could has a bypass reference to the input.
2654		for (i = 0; i < node->input_size; i++)
2655			max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2656		for (i = 0; i < node->output_size; i++)
2657			max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2658		ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2659		dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2660	}
2661	return (ccv_nnc_graph_exec_symbol_t) {
2662		.d = dup_exec_ref[idx * unroll_count],
2663		.graph = dup_graph,
2664	};
2665}
2666 
2667static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2668{
2669	int i;
2670	for (i = 0; i < tensor_block_size; i++)
2671	{
2672		if (tensor_blocks[i].head)
2673			ccv_array_free(tensor_blocks[i].head);
2674		if (tensor_blocks[i].tail)
2675			ccv_array_free(tensor_blocks[i].tail);
2676		if (tensor_blocks[i].r_refs)
2677			ccv_array_free(tensor_blocks[i].r_refs);
2678		if (tensor_blocks[i].dup_p_refs)
2679			ccv_array_free(tensor_blocks[i].dup_p_refs);
2680	}
2681	ccfreefree(tensor_blocks);
2682}
2683 
2684// Find tensors that cannot be solved by co-allocating to the same location.
2685static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2686{
2687	int i, j, unroll_count = 0;
2688	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2689		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2690		{
2691			// This is is a parameter, thus, it has to be either an alias or used.
2692			assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
 & 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
 ("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2692, __extension__ __PRETTY_FUNCTION__
); }));
2693			const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2694			// The parameter it assign to has to be either an alias or used.
2695			assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2695, __extension__ __PRETTY_FUNCTION__
); }));
2696			// If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2697			// If it is the same, we are good, no need to extend.
2698			int a_ref = i;
2699			while (tensor_blocks[a_ref].ref)
2700				a_ref = tensor_blocks[a_ref].ref - 1;
2701			int b_ref = assign_ref;
2702			while (tensor_blocks[b_ref].ref)
2703				b_ref = tensor_blocks[b_ref].ref - 1;
2704			if (a_ref != b_ref)
2705			{
2706				// If any of the b's head is deterministically later than a's tail
2707				// or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2708				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2709				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2710				// It cannot be that both i can hop to j can j can hop to i.
2711				assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
 ? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
 > 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2711, __extension__ __PRETTY_FUNCTION__
); }));
2712				// Can it be folded
2713				// These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2714				if (a_hop_b || b_hop_a)
2715				{
2716					tensor_blocks[a_ref].companion_ref = b_ref + 1;
2717					tensor_blocks[b_ref].companion_ref = a_ref + 1;
2718					continue;
2719				}
2720				int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2721				for (j = 0; c_ref >= 0; j++)
2722				{
2723					while (tensor_blocks[c_ref].ref)
2724						c_ref = tensor_blocks[c_ref].ref - 1;
2725					c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2726				}
2727				unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
 = (j + 1); (_a > _b) ? _a : _b; });
2728			}
2729		}
2730	// Reset companion_ref if need to unroll.
2731	if (unroll_count)
2732		for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2733			tensor_blocks[j].companion_ref = 0;
2734	return unroll_count;
2735}
2736 
2737static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2738{
2739	int i, j, n;
2740	// The inout exec nodes, these are the nodes we are going to extend.
2741	uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2742	int max_input_size = 0;
2743	int max_output_size = 0;
2744	for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2745	{
2746		max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; });
2747		max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; });
2748	}
2749	ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })];
2750	ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })];
2751	// Doing graph expansion
2752	// It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
2753	assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
 0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2753, __extension__ __PRETTY_FUNCTION__
); }));
2754	assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
 0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2754, __extension__ __PRETTY_FUNCTION__
); }));
2755#define INCOMING_NODE (1)
2756#define OUTGOING_NODE (2)
2757	// Unroll the graph n times.
2758	for (n = 0; n < unroll_count; n++)
2759	{
2760		int* const dup_exec_ref = r_dup_exec_ref + n;
2761		const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2762		int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2763		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2764			dup_exec_ref[i * unroll_count] = -1;
2765		for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2766		{
2767			// If there is a assign_ref, that means I don't need to dup the tensor.
2768			if (tensor_symbol_info[i].assign_ref)
2769			{
2770				const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2771				dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2772			} else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2773			// If this is a read-only tensor block, no need to duplicate because the value never changes
2774			// (note we handled assign_ref first), therefore, no need to generate duplicate.
2775				dup_tensor_block_ref[i * unroll_count] = i;
2776			else
2777				dup_tensor_block_ref[i * unroll_count] = -1;
2778		}
2779		// Go through the original graph, make copies of the node if it is inout.
2780		ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2781			ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2782			inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2783			if (!node->outgoings)
2784				continue;
2785			for (i = 0; i < node->outgoings->rnum; i++)
2786			{
2787				const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
2788				inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2789				ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2790				ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2791			}
2792		} ccv_nnc_graph_visit_endfor} }
2793		// Check the visitor are all marked as either incoming or outgoing.
2794		const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2795		const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2796		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2797		{
2798			if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2799				continue;
2800			assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
 OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
 INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
 ("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2800, __extension__ __PRETTY_FUNCTION__
); }));
2801			// If this is pure incoming nodes, then I need to concat this one with all original destination node
2802			if (inout[i] == INCOMING_NODE)
2803				for (j = 0; j < dup_destination_size; j++)
2804				{
2805					ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2806						.d = dup_destinations[j].d,
2807						.graph = dup_graph,
2808					}, (ccv_nnc_graph_exec_symbol_t) {
2809						.d = dup_exec_ref[i * unroll_count],
2810						.graph = dup_graph,
2811					});
2812				}
2813		}
2814		if (dup_graph->destinations)
2815			ccv_array_clear(dup_graph->destinations);
2816		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2817		{
2818			if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2819				continue;
2820			const int d = dup_exec_ref[i * unroll_count];
2821			ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
 + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)));
2822			// If this has no outgoing node, add to the destination.
2823			if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2824				ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2825					.graph = dup_graph,
2826					.d = d,
2827				});
2828		}
2829	}
2830#undef INCOMING_NODE
2831#undef OUTGOING_NODE
2832	ccfreefree(inout);
2833}
2834 
2835static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2836{
2837	int i;
2838	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2839		// Now can assign them (The dup) as companion.
2840		// Get to the last one, which we will wrap over.
2841		if (dup_tensor_symbol_info[i].assign_ref)
2842		{
2843			dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2844			dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2845			assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
 ; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2845, __extension__ __PRETTY_FUNCTION__
); }));
2846			dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2847		}
2848}
2849 
2850// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2851// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2852// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2853static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2854{
2855	int i, j, k;
2856	for (i = 0; i < p_node_info->output_size; i++)
2857	{
2858		const int d = p_node_info->outputs[i];
2859		const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx))) - 1;
2860		if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
 !((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED)))
2861			continue;
2862		for (k = 0; k < destination_size; k++)
2863			_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2864		// Add the duplicated destinations to the tensor_block_ref.
2865		for (j = 0; j < unroll_count; j++)
2866			for (k = 0; k < destination_size; k++)
2867			{
2868				const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2869				const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2870				if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2871					_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2872			}
2873	}
2874}
2875 
2876static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2877{
2878	int i, j;
2879	ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2880	ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2881	// blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2882	// Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2883	// No need to change anything, we are good.
2884	const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2885	if (!unroll_count)
2886		return;
2887	// Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2888	// Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2889	ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2890	int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2891	int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2892	_ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2893	ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2894	ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2895	ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
(dup_graph->exec_symbol_info->rnum) + _incoming_edges_ >
 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
 = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = {
 (dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1
; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for
 (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
 _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue
; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r =
 1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) {
 ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph
->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1
; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
 const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_
[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((
_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum
)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->destinations->rnum); _exist_size_[1] =
 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
 const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
 for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
 = 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
 int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
 6 && _d_ < (dup_graph->destinations->rnum))
 { _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
 < ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 6 && _d_ <
 (dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0
), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->
rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
 ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
 ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
 ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
 <= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
 ({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
2896	ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2897	_ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2898	// Free out the old exec_dep
2899	ccv_matrix_free(exec_dep);
2900	// and the tensor blocks, prepare for the new.
2901	_ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2902	// A reverse map to find where the original tensor comes from.
2903	int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2904	for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2905		dup_tensor_from_ref[i] = -1;
2906	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2907		for (j = 0; j < unroll_count; j++)
2908			if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2909				dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2910	int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2911	for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2912		dup_exec_from_ref[i] = -1;
2913	for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2914	{
2915		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2916			continue;
2917		dup_exec_from_ref[i] = i; // Reference back.
2918		for (j = 0; j < unroll_count; j++)
2919			if (dup_exec_ref[i * unroll_count + j] >= 0)
2920				dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2921	}
2922	// Reset all attr.
2923	memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2924	_ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2925	ccv_nnc_graph_visit_free(dup_visit);
2926	ccfreefree(dup_exec_symbol_info);
2927	ccfreefree(dup_exec_from_ref);
2928	ccfreefree(dup_tensor_from_ref);
2929	// Assign out dup_p_ref, which will be used to extended the anonymous block life-time.
2930	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2931		// Loop over all possible duplications to assign dup_p_ref properly.
2932		for (j = 0; j < unroll_count; j++)
2933		{
2934			const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2935			if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2936			{
2937				const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2938				const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2939				if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2940				{
2941					if (!tensor_blocks[dup_idx].dup_p_refs)
2942						tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2943					ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2944				}
2945				if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2946					continue;
2947				const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2948				const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2949				if (p_ref_1_is_in_or_out == 1)
2950				{
2951					if (!tensor_blocks[dup_idx].dup_p_refs)
2952						tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2953					ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2954				}
2955			}
2956		}
2957	// companion_ref
2958	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2959		// Now can assign them (The dup) as companion.
2960		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2961		{
2962			// Get to the last one, which we will wrap over.
2963			const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2964			if (assign_ref >= 0)
2965			{
2966				int b_ref = assign_ref;
2967				while (tensor_blocks[b_ref].ref)
2968					b_ref = tensor_blocks[b_ref].ref - 1;
2969				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2970				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2971				// It cannot be that both i can hop to j can j can hop to i.
2972				// And it can be hop from one to another now after duplication.
2973				assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
 ({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
 ("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2973, __extension__ __PRETTY_FUNCTION__); }));
2974				tensor_blocks[i].companion_ref = b_ref + 1;
2975				tensor_blocks[b_ref].companion_ref = i + 1;
2976			}
2977		}
2978	ccfreefree(dup_tensor_symbol_info);
2979	// Extend the dup tensor block ref, prepare for future extensions.
2980	dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2981	for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2982		dup_tensor_block_ref[i] = -1;
2983	// Assign out changed properties.
2984	*r_exec_dep = exec_dep;
2985	*r_tensor_blocks = tensor_blocks;
2986	*r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2987	*r_dup_graph = dup_graph;
2988	*r_unroll_count = unroll_count;
2989	*r_dup_exec_ref = dup_exec_ref;
2990	*r_dup_tensor_block_ref = dup_tensor_block_ref;
2991}
2992 
2993static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2994{
2995	if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2996		return tensor_block_size;
2997	int i;
2998	const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2999	int found_idx = tensor_block_size;
3000	for (i = 0; i < anonymous_block_free_list_cap; i++)
3001	{
3002		const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)));
3003		assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
 ({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 3003, __extension__ __PRETTY_FUNCTION__
); }));
3004		// If the type doesn't match, ignore.
3005		if (tensor_blocks[idx].type != type)
3006			continue;
3007		// Heuristic about how to select the best tensor block to move forward.
3008		// If the size is larger, and no dup_p_refs, found, I cannot do better than this, just return directly.
3009		if (tensor_blocks[idx].size >= size)
3010		{
3011			if (no_dup_p_refs)
3012				return idx;
3013			// Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
3014			// then we cannot do better than this, if that is the case, just return.
3015			if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
3016				_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
3017				return idx;
3018		}
3019		int64_t found_idx_size_diff;
3020		int64_t idx_size_diff;
3021		if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
3022			// Now, compare whether this one or the found_idx one is better.
3023			// At this point, there is no point of comparing the dup_p_refs, we only care about which one
3024			// is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
3025			(found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3026		{
3027			found_idx = idx;
3028			continue;
3029		}
3030		// No need to update if found_idx is better than idx.
3031		if (found_idx_size_diff > idx_size_diff)
3032			continue;
3033		// We bias towards the bigger one in case of similar.
3034		if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3035		{
3036			found_idx = idx;
3037			continue;
3038		}
3039		assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
 == tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 3039, __extension__ __PRETTY_FUNCTION__
); }));
3040		// On a tie, check which one has tighter life-cycle.
3041		if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3042		{
3043			// Check whether the current tensor blocks life-cycle is longer than the previous one.
3044			if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3045				(!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3046				 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3047				found_idx = idx;
3048			continue;
3049		}
3050		// Now both our size is smaller than requested size, in this case, we need to increase the tensor block size.
3051		// We prefer to choose the one that has life-cycle closer to the expected ones.
3052		if (no_dup_p_refs)
3053		{
3054			// Whoever is shorter wins.
3055			if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3056				(!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3057				 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3058				found_idx = idx;
3059			continue;
3060		}
3061		if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3062			continue;
3063		if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3064		{
3065			found_idx = idx;
3066			continue;
3067		}
3068		// If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3069		const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3070		const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3071		if (idx_after_request && found_idx_after_request)
3072		{
3073			if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3074				found_idx = idx;
3075			continue;
3076		} else {
3077			// We entered this branch must be either idx_after_request is false or found_idx_after_request is false or both.
3078			// If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3079			// Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3080			if (!found_idx_after_request && (idx_after_request ||
3081				_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3082				found_idx = idx;
3083			continue;
3084		}
3085	}
3086	return found_idx;
3087}
3088 
// For a sub-graph that belongs to a while-loop node (p_node_info carries CCV_NNC_GRAPH_EXEC_P_WHILE),
// create one NOOP exec symbol per breakpoint of symbolic_graph. Each NOOP takes the while node's
// tensor inputs (the entries of p_while.inputs that are >= 0) and is wired into the graph with the
// same incoming / outgoing edges as the breakpoint it mirrors; it is also registered as a source /
// destination whenever the mirrored breakpoint is one.
// Note that this modifies symbolic_graph (new exec symbols, new edges, possibly new sources / destinations).
// Returns the array of NOOP symbols (index k mirrors symbolic_graph->breakpoints[k]), or 0 when
// p_node_info is null, is not a while node, or the while node has no tensor inputs.
3089static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3090{
	// Nothing to do unless the parent node exists and is flagged as a while loop.
3091	if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3092		return 0;
3093	int i, j, k;
	// First pass: count the non-negative entries of p_while.inputs (negative entries are skipped).
3094	int input_size = 0;
3095	for (i = 0; i < p_node_info->p_while.input_size; i++)
3096		if (p_node_info->p_while.inputs[i] >= 0)
3097			++input_size;
3098	// If it doesn't have tensor inputs (thus, only special inputs), just return.
3099	if (!input_size)
3100		return 0;
	// Second pass: collect those inputs as tensor symbols bound to symbolic_graph.
3101	ccv_nnc_tensor_symbol_t inputs[input_size];
3102	input_size = 0;
3103	for (i = 0; i < p_node_info->p_while.input_size; i++)
3104		if (p_node_info->p_while.inputs[i] >= 0)
3105			inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3106				.d = p_node_info->p_while.inputs[i],
3107				.graph = symbolic_graph,
3108			};
3109	assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
 1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
 > 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3109, __extension__ __PRETTY_FUNCTION__
); }));
3110	ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
	// The symbol count is read before any noop is created below.
	// NOTE(review): presumably so the incoming-edge scan further down only visits pre-existing symbols - confirm.
3111	const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3112	for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3113	{
3114		// Make a noop copy of the breakpoint, but with some tensor inputs.
3115		ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
		// Pushed in breakpoint order, so dup_breakpoints[i] corresponds to breakpoints[i].
3116		ccv_array_push(dup_breakpoints, &noop);
3117		// Connect this noop to the outgoing nodes of breakpoints.
3118		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(symbolic_graph->breakpoints[i].d)));
3119		if (symbol_info->outgoings)
3120			for (j = 0; j < symbol_info->outgoings->rnum; j++)
3121			{
3122				const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)));
3123				ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3124					.d = d,
3125					.graph = symbolic_graph,
3126				});
3127			}
3128	}
	// Mirror the incoming edges: any live symbol that points at a breakpoint is also connected
	// to that breakpoint's noop.
3129	for (i = 0; i < exec_symbol_info_size; i++)
3130	{
3131		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(i)));
3132		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
3133			continue;
3134		if (symbol_info->outgoings)
3135		{
			// The outgoing count is read once, before the concat calls below.
			// NOTE(review): presumably concat appends to this same outgoings array - confirm.
3136			const int outgoing_size = symbol_info->outgoings->rnum;
3137			for (j = 0; j < outgoing_size; j++)
3138			{
3139				const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)));
3140				for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3141					if (d == symbolic_graph->breakpoints[k].d)
3142					{
3143						ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(k)));
3144						ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3145							.d = i,
3146							.graph = symbolic_graph,
3147						}, noop);
3148						// Found, connected, exit.
3149						break;
3150					}
3151			}
3152		}
3153	}
3154	// Add the dup_breakpoints to source if necessary.
3155	assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 3155, __extension__ __PRETTY_FUNCTION__
); }));
	// The source count is read once, before any add_source call below.
3156	const int source_size = symbolic_graph->sources->rnum;
3157	for (i = 0; i < source_size; i++)
3158	{
3159		const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(i))))->d;
3160		for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3161			if (d == symbolic_graph->breakpoints[j].d)
3162			{
3163				ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)));
3164				ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3165				// Found, made, exit.
3166				break;
3167			}
3168	}
3169	// Add the dup_breakpoints to destination if necessary.
3170	assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
 ({ if (symbolic_graph->destinations) ; else __assert_fail
 ("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 3170, __extension__ __PRETTY_FUNCTION__); }));
	// The destination count is read once, before any add_destination call below.
3171	const int destination_size = symbolic_graph->destinations->rnum;
3172	for (i = 0; i < destination_size; i++)
3173	{
3174		const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(i))))->d;
3175		for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3176			if (d == symbolic_graph->breakpoints[j].d)
3177			{
3178				ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)));
3179				ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3180				// Found, made, exit.
3181				break;
3182			}
3183	}
3184	return dup_breakpoints;
3185}
3186 
3187// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3188static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3189{
3190	assert(source_size > 0)((void) sizeof ((source_size > 0) ? 1 : 0), __extension__ (
{ if (source_size > 0) ; else __assert_fail ("source_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3190, __extension__ __PRETTY_FUNCTION__
); }));
3191	assert(destination_size > 0)((void) sizeof ((destination_size > 0) ? 1 : 0), __extension__
 ({ if (destination_size > 0) ; else __assert_fail ("destination_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3191, __extension__ __PRETTY_FUNCTION__
); }));
3192	// First, fill all the "auto" holes.
3193	// This is the symbol table with the "auto" info filled in.
3194	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3195	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3196	ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3197	ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
 c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
 = ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
 > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
 _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
 = (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3197, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
 } _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
 3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
 __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3197, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
 _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3197, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
 1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
 _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
 _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
 ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
 ((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
 : 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
3198	ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3199	int i, j, k, p, q;
3200	const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3201	ccv_sparse_matrix_t* exec_dep;
3202	ccv_nnc_tensor_block_t* tensor_blocks;
3203	_ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3204	int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3205	// Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3206	// are automatically filled in, and all the sub-graphs are processed.
3207	// There is a last step though, for a while loop, it is parameterized:
3208	// while (x > 5) {
3209	//     y = x + 1;
3210	// } (y => x) // This means after this loop is done, y's value will be copied over to x.
3211	// We will do our best to avoid the actual data copy; what we do here is to check whether y can be x's alias.
3212	// If y can be x's alias, this is good, no other changes are required. In the above case, y can be x's alias because
3213	// it is an in-place operation.
3214	// But if y cannot be x's alias, for example, this while loop looks like this:
3215	// while (x > 5) {
3216	//     y = x + a
3217	//     b = x + y
3218	// } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3219	// For this example, y cannot be x's alias because x is used later to compute b (and that computation
3220	// has dependency on y as well).
3221	// For this case, we need to modify the computation graph. Previously, the graph looks like this:
3222	// y = x + a -> b = x + y
3223	// This graph will be extended to look like this:
3224	// y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3225	// while (x0 > 5) {
3226	//     y0 = x0 + a0
3227	//     b0 = x0 + y0
3228	//     if (y0 > 5) break
3229	//     y1 = y0 + b0
3230	//     b1 = y0 + y1
3231	// } (y1 => x0, b1 => a0)
3232	// After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3233	// with each other now).
3234	// With this algorithm, we don't need to insert any data copy logic, the only thing need is to switch pointers
3235	// which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
3236	ccv_nnc_symbolic_graph_t* dup_graph = 0;
3237	int* dup_exec_ref = 0;
3238	int* dup_tensor_block_ref = 0;
3239	int unroll_count = 0;
3240	// In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3241	ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3242	prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3243	prep->flags = 0;
3244	// Cannot handle dup a node that is a graph as well.
3245	if (p_exec_symbol_info)
3246	{
3247		prep->flags = p_node_info->flags;
3248		if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3249		{
3250			_ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3251			_ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0))), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3252		} else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3253			// TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3254		}
3255	}
3256	ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3257	ccv_array_t* anonymous_block_free_list = 0;
3258	const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3259	// Record whether this tensor is folded in this round.
3260	uint32_t* const tensor_fold = (uint32_t*)ccmallocmalloc(sizeof(uint32_t) * tensor_fold_size);
3261	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
3262		for (p = 0; p < node->graph_ref_size; p++)
3263		{
3264			assert(symbolic_graph->sub_graphs)((void) sizeof ((symbolic_graph->sub_graphs) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sub_graphs) ; else __assert_fail (
"symbolic_graph->sub_graphs", "ccv_nnc_symbolic_graph_compile.c"
, 3264, __extension__ __PRETTY_FUNCTION__); }));
3265			ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (
node)->_inline_graph_ref)[p] - 1)));
3266			ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3267			ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0)((void*)(((char*)((sub_graph->sources)->data)) + (size_t
)(sub_graph->sources)->rsize * (size_t)(0))), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0)((void*)(((char*)((sub_graph->destinations)->data)) + (
size_t)(sub_graph->destinations)->rsize * (size_t)(0))), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3268			sub_prep->dup_breakpoints = dup_breakpoints;
3269			sub_prep->p = prep;
3270			sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[p] - 1] = sub_prep;
3271			const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3272			const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3273			for (i = 0; i < s_alloc_prep->block_size; i++)
3274			{
3275				const int block_ref = s_alloc_prep->blocks[i].block_ref;
3276				const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3277				if (block_ref < sub_prep->tensor_symbol_info_size)
3278				{
3279					// If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3280					// I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3281					if (s_tensor_blocks[block_ref].bypass_ref)
3282					{
3283						int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3284						while (s_tensor_blocks[bypass_ref].ref)
3285							bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3286						if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3287							s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3288							continue;
3289					}
3290					if (s_tensor_blocks[block_ref].p_refs[0])
3291					{
3292						/* If it is already properly assigned, next. */
3293						if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3294							s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3295						{
3296							if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3297								s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3298							else {
3299								assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3299, __extension__ __PRETTY_FUNCTION__
); }));
3300								s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3301							}
3302						}
3303						/* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3304						if (s_tensor_blocks[block_ref].p_refs[1] &&
3305							s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3306							s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3307						{
3308							assert(s_alloc_prep->buffers[buffer_ref].p_refs[0])((void) sizeof ((s_alloc_prep->buffers[buffer_ref].p_refs[
0]) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[buffer_ref
].p_refs[0]) ; else __assert_fail ("s_alloc_prep->buffers[buffer_ref].p_refs[0]"
, "ccv_nnc_symbolic_graph_compile.c", 3308, __extension__ __PRETTY_FUNCTION__
); }));
3309							assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3309, __extension__ __PRETTY_FUNCTION__
); }));
3310							s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3311						}
3312					}
3313				} else if (s_tensor_blocks[block_ref].dup_p_refs) {
3314					/* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3315					 * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3316					 * these anonymous blocks that has dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3317					 * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3318					 * its life-time to the end of the output tensor. */
3319					if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3320						s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3321					for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3322						ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j)((void*)(((char*)((s_tensor_blocks[block_ref].dup_p_refs)->
data)) + (size_t)(s_tensor_blocks[block_ref].dup_p_refs)->
rsize * (size_t)(j))));
3323				}
3324			}
3325		}
3326		const int init_tensor_block_size = tensor_block_size;
3327		int rw_anonymous_buffer_size_cap = 0;
3328		int ro_anonymous_buffer_size_cap = 0;
3329		if (anonymous_block_free_list)
3330			ccv_array_clear(anonymous_block_free_list);
3331		memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3332		for (p = 0; p < node->graph_ref_size; p++)
3333		{
3334			ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[p] - 1];
3335			const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3336			int rw_anonymous_buffer_size = 0;
3337			int ro_anonymous_buffer_size = 0;
3338			for (i = 0; i < s_alloc_prep->buffer_size; i++)
3339				if (s_alloc_prep->buffers[i].p_refs[0])
3340				{
3341					/* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3342					int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3343					/* Need to go through refs. Since we reuse the tensor block for this input, it now has to have allocate at least this much space. */
3344					int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3345					assert(p_ref_0_is_in_or_out != 0)((void) sizeof ((p_ref_0_is_in_or_out != 0) ? 1 : 0), __extension__
 ({ if (p_ref_0_is_in_or_out != 0) ; else __assert_fail ("p_ref_0_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3345, __extension__ __PRETTY_FUNCTION__
); }));
3346					int unref_p_ref_0 = p_ref_0;
3347					while (tensor_blocks[unref_p_ref_0].ref)
3348						unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3349					/* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3350					assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3350, __extension__ __PRETTY_FUNCTION__); }));
3351					if (s_alloc_prep->buffers[i].p_refs[1])
3352					{
3353						int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3354						const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3355						assert(p_ref_1_is_in_or_out != 0)((void) sizeof ((p_ref_1_is_in_or_out != 0) ? 1 : 0), __extension__
 ({ if (p_ref_1_is_in_or_out != 0) ; else __assert_fail ("p_ref_1_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3355, __extension__ __PRETTY_FUNCTION__
); }));
3356						int unref_p_ref_1 = p_ref_1;
3357						while (tensor_blocks[unref_p_ref_1].ref)
3358							unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3359						/* See above comment for the similar p_ref_0 check. */
3360						assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]))((void) sizeof ((!((tensor_blocks[unref_p_ref_1].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_1].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 3360, __extension__ __PRETTY_FUNCTION__); }));
3361						assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out)((void) sizeof ((p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ; else __assert_fail ("p_ref_0_is_in_or_out != p_ref_1_is_in_or_out"
, "ccv_nnc_symbolic_graph_compile.c", 3361, __extension__ __PRETTY_FUNCTION__
); }));
3362						int p_ref_t;
3363						if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3364						{
3365							CCV_SWAP(p_ref_0, p_ref_1, p_ref_t)((p_ref_t) = (p_ref_0), (p_ref_0) = (p_ref_1), (p_ref_1) = (p_ref_t
));
3366							CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t)((p_ref_t) = (unref_p_ref_0), (unref_p_ref_0) = (unref_p_ref_1
), (unref_p_ref_1) = (p_ref_t));
3367						}
3368						p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3369						/* If the dimensions match, we can fold. TODO: should the dimensions match exactly here? */
3370						if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
3371						{
3372							const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3373							if (folded)
3374							{
3375								p_ref_0 = p_ref_1;
3376								unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3377								tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3378								for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3379								{
3380									const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3381									assert(folded && "the subsequent duplicates can be folded too.")((void) sizeof ((folded && "the subsequent duplicates can be folded too."
) ? 1 : 0), __extension__ ({ if (folded && "the subsequent duplicates can be folded too."
) ; else __assert_fail ("folded && \"the subsequent duplicates can be folded too.\""
, "ccv_nnc_symbolic_graph_compile.c", 3381, __extension__ __PRETTY_FUNCTION__
); }));
3382								}
3383							}
3384						}
3385					}
3386					/* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3387					 * Or if the p_ref_0 is the output, it is first started from this node (thus, I have full control over
3388					 * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3389					 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere else, I cannot change the content
3390					 * within its memory region)). Or if this buffer is used as read-only, and we don't have any output
3391					 * associated with it, then we are good. */
3392					if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3393						(p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3394						(p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3395						TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3396					{
3397						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3398							{ assert(s_alloc_prep->buffers[i].p_refs[1] == 0)((void) sizeof ((s_alloc_prep->buffers[i].p_refs[1] == 0) ?
 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[i].p_refs
[1] == 0) ; else __assert_fail ("s_alloc_prep->buffers[i].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3398, __extension__ __PRETTY_FUNCTION__
); })); }
3399						/* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3400						 * is a long argument why that is the case, the digest is, it is much easier to control your output
3401						 * than your input). */
3402						s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3403						s_alloc_prep->buffers[i].p_refs[1] = 0;
3404						/* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3405						assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3405, __extension__ __PRETTY_FUNCTION__); }));
3406						tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size)({ typeof (s_alloc_prep->buffers[i].size) _a = (s_alloc_prep
->buffers[i].size); typeof (tensor_blocks[unref_p_ref_0].size
) _b = (tensor_blocks[unref_p_ref_0].size); (_a > _b) ? _a
 : _b; });
3407						for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3408							tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3409								tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3410									tensor_blocks[unref_p_ref_0].size;
3411					} else {
3412						s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3413						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3414							++ro_anonymous_buffer_size;
3415						else
3416							rw_anonymous_buffer_size += unroll_count + 1;
3417					}
3418				} else {
3419					if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3420						++ro_anonymous_buffer_size;
3421					else
3422						rw_anonymous_buffer_size += unroll_count + 1;
3423				}
3424			if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3425			{
3426				const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3427				// All read-write buffer (potentially) can be reused between each case..of branch.
3428				rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3429				// Read-only buffer cannot be reused between each case..of branch.
3430				ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3431				/* Anonymous block, allocate additional tensor blocks for this. */
3432				/* This is either because this is an internal tensor (don't have p_ref) */
3433				/* or it is an anonymous block itself within the sub graphs of this while graph. */
3434				tensor_blocks = (ccv_nnc_tensor_block_t*)ccreallocrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3435				memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3436				if (dup_tensor_block_ref)
3437					dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3438				for (i = 0; i < s_alloc_prep->buffer_size; i++)
3439					if (!s_alloc_prep->buffers[i].p_refs[0])
3440					{
3441						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3442						{
3443							assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)((void) sizeof ((tensor_block_size < init_tensor_block_size
 + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ? 1 : 0), __extension__ ({ if (tensor_block_size < init_tensor_block_size
 + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ; else __assert_fail ("tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap"
, "ccv_nnc_symbolic_graph_compile.c", 3443, __extension__ __PRETTY_FUNCTION__
); }));
3444							TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size])(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0x10) | ANONYMOUS));
3445							TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3446							tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3447							tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3448							tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3449							s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3450							tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3451							ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3452							ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3453							if (dup_p_refs && dup_p_refs->rnum > 0)
3454							{
3455								for (j = 0; j < dup_p_refs->rnum; j++)
3456								{
3457									const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3458									assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3458, __extension__ __PRETTY_FUNCTION__
); }));
3459									assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3459, __extension__ __PRETTY_FUNCTION__
); }));
3460									assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
 ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3460, __extension__ __PRETTY_FUNCTION__); }));
3461									// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3462									// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3463									if (tensor_symbol_info[dup_p_ref].p_ref)
3464									{
3465										const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3466										assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3466, __extension__ __PRETTY_FUNCTION__); }));
3467										const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3468										if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3469										{
3470											if (!tensor_blocks[tensor_block_size].dup_p_refs)
3471												tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3472											ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3473										}
3474									}
3475									if (!tensor_blocks[tensor_block_size].tail)
3476										tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3477									for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3478										_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
 (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k))), tensor_blocks[tensor_block_size]);
3479								}
3480							} else {
3481								tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3482								ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3483							}
3484							for (j = 0; j < source_size; j++)
3485								_ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3486							/* If this is a read-only (based on SSA, if first encountered as read), and this is
3487							 * sub-graph. Mark it to the end of the graph. */
3488							if (p_exec_symbol_info)
3489								for (j = 0; j < destination_size; j++)
3490									_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3491							/* If it is read-only, it is self-reflecting. */
3492							for (k = 0; k < unroll_count; k++)
3493							{
3494								for (j = 0; j < destination_size; j++)
3495									if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3496									_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3497								/* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3498								assert(symbolic_graph->p)((void) sizeof ((symbolic_graph->p) ? 1 : 0), __extension__
 ({ if (symbolic_graph->p) ; else __assert_fail ("symbolic_graph->p"
, "ccv_nnc_symbolic_graph_compile.c", 3498, __extension__ __PRETTY_FUNCTION__
); }));
3499								dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3500							}
3501							++tensor_block_size;
3502						} else {
3503							ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3504							const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3505							const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3506							// Find suitable tensor block from the free list.
3507							TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS));
3508							TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3509							s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3510							if (new_anonymous_tensor_block)
3511							{
3512								tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3513								tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3514								tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3515								tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3516								ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3517							} else {
3518								tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3519								tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
 _a : _b; });
3520							}
3521							if (dup_p_refs && dup_p_refs->rnum > 0)
3522							{
3523								for (j = 0; j < dup_p_refs->rnum; j++)
3524								{
3525									const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3526									assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3526, __extension__ __PRETTY_FUNCTION__
); }));
3527									assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3527, __extension__ __PRETTY_FUNCTION__
); }));
3528									// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3529									// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3530									if (tensor_symbol_info[dup_p_ref].p_ref)
3531									{
3532										const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3533										assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3533, __extension__ __PRETTY_FUNCTION__); }));
3534										const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3535										if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3536										{
3537											if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3538												tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3539											ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3540										}
3541									}
3542									assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
 ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3542, __extension__ __PRETTY_FUNCTION__); }));
3543									if (!tensor_blocks[tensor_block_idx].tail)
3544										tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3545									for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3546										_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
 (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k))), tensor_blocks[tensor_block_idx]);
3547									// We have to add it to the wrap-around companion_ref as well.
3548									// TODO: We know this wastes space (any space in between the current one and its companion_ref will still
3549									// be occupied and is unlikely to be reused), but we cannot really do much about it because the companion_ref's
3550									// definition is too free-form, and if we enforce a stronger guarantee on this (such as it must wrap around), this
3551									// guarantee may be broken down the line.
3552									if (tensor_blocks[dup_p_ref].companion_ref)
3553									{
3554										const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3555										for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3556											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3557										for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3558											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3559									}
3560								}
3561							} else if (new_anonymous_tensor_block) {
3562								tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3563								ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3564							}
3565							const int prev_tensor_block_idx = tensor_block_idx;
3566							if (new_anonymous_tensor_block)
3567							{
3568								if (!anonymous_block_free_list)
3569									anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3570								ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3571								++tensor_block_size;
3572							}
3573							for (k = 0; k < unroll_count; k++)
3574							{
3575								const int tensor_block_idx = new_anonymous_tensor_block ?
3576									(dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3577									dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3578								TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS));
3579								TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3580								if (new_anonymous_tensor_block)
3581								{
3582									tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3583									tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3584									tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3585									tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3586									/* Attach to duplicated exec for this tensor block. */
3587									ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3588								} else {
3589									tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3590									tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
 _a : _b; });
3591									_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3592 
3593								}
3594								if (dup_p_refs && dup_p_refs->rnum > 0)
3595								{
3596									/* Not nil, not self-reflecting. */
3597									for (j = 0; j < dup_p_refs->rnum; j++)
3598									{
3599										const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3600										assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3600, __extension__ __PRETTY_FUNCTION__
); }));
3601										assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3601, __extension__ __PRETTY_FUNCTION__
); }));
3602										// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3603										// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3604										if (tensor_symbol_info[dup_p_ref].p_ref)
3605										{
3606											const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3607											assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3607, __extension__ __PRETTY_FUNCTION__); }));
3608											const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3609											if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3610											{
3611												if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3612													tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3613												ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3614											}
3615										}
3616										assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref)((void) sizeof ((dup_tensor_block_ref[dup_p_ref * unroll_count
 + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count
 + k] != dup_p_ref) ? 1 : 0), __extension__ ({ if (dup_tensor_block_ref
[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref
[dup_p_ref * unroll_count + k] != dup_p_ref) ; else __assert_fail
 ("dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref"
, "ccv_nnc_symbolic_graph_compile.c", 3616, __extension__ __PRETTY_FUNCTION__
); }));
3617										const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3618										assert(tensor_blocks[dup_dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_dup_p_ref].tail) ? 1 : 0),
 __extension__ ({ if (tensor_blocks[dup_dup_p_ref].tail) ; else
 __assert_fail ("tensor_blocks[dup_dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3618, __extension__ __PRETTY_FUNCTION__); }));
3619										if (!tensor_blocks[tensor_block_idx].tail)
3620											tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3621										for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3622											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q)((void*)(((char*)((tensor_blocks[dup_dup_p_ref].tail)->data
)) + (size_t)(tensor_blocks[dup_dup_p_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3623										// We have to add it to the wrap-around companion_ref as well.
3624										if (tensor_blocks[dup_dup_p_ref].companion_ref)
3625										{
3626											const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3627											for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3628												_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3629											for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3630												_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3631										}
3632									}
3633								} else if (new_anonymous_tensor_block) {
3634									tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3635									ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3636								}
3637								if (new_anonymous_tensor_block)
3638									++tensor_block_size;
3639							}
3640						}
3641					}
3642			}
3643		}
3644	} ccv_nnc_graph_visit_endfor} }
3645	if (anonymous_block_free_list)
3646		ccv_array_free(anonymous_block_free_list);
3647	ccfreefree(tensor_fold);
3648	// It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3649	// the allocation dependencies, thus, which tensor is reused to the existing tensor.
3650	ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3651	prep->while_count_tensor = 0;
3652	prep->dup_breakpoints = 0;
3653	prep->p = 0;
3654	prep->symbolic_graph = symbolic_graph;
3655	prep->p_idx = symbolic_graph->p_idx;
3656	prep->exec_idx = symbolic_graph->exec_idx;
3657	prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3658	prep->sub_preps = sub_preps;
3659	prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3660	prep->exec_symbol_info = exec_symbol_info;
3661	prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3662	prep->tensor_symbol_info = tensor_symbol_info;
3663	prep->unroll_count = unroll_count;
3664	prep->dup_tensor_block_ref = dup_tensor_block_ref;
3665	prep->tensor_block_size = tensor_block_size;
3666	prep->tensor_blocks = tensor_blocks;
3667	prep->exec_flags = exec_flags;
3668	prep->visit = visit;
3669	prep->alloc_prep = alloc_prep;
3670	if (dup_graph)
3671		ccv_nnc_symbolic_graph_free(dup_graph);
3672	if (dup_exec_ref)
3673		ccfreefree(dup_exec_ref);
3674	return prep;
3675}
3676 
/* Tear down a graph prep object and the resources it references.
 *
 * Releases, in this order: the tensor blocks, exec_flags, every non-NULL
 * entry of sub_preps (recursively, via this same function), the sub_preps
 * array itself, tensor_symbol_info, exec_symbol_info, dup_tensor_block_ref
 * (when set), alloc_prep, the visit object, and finally prep itself.
 * prep->symbolic_graph and prep->p are not touched here.
 *
 * NOTE(review): `ccfreefree` below looks like the `ccfree` macro printed
 * together with its expansion (`free`) by the report renderer -- confirm
 * against the real source file.
 *
 * @param prep The prep to destroy; it is dereferenced unconditionally and
 *             must not be used after this call returns.
 */
3677static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3678{
3679	int i;
3680	_ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3681	ccfreefree(prep->exec_flags);
	/* Free the children first; the array holding them is released right after. */
3682	for (i = 0; i < prep->sub_prep_size; i++)
3683		if (prep->sub_preps[i])
3684			_ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3685	if (prep->sub_preps)
3686		ccfreefree(prep->sub_preps);
3687	ccfreefree(prep->tensor_symbol_info);
3688	ccfreefree(prep->exec_symbol_info);
	/* dup_tensor_block_ref may be unset, hence the guard. */
3689	if (prep->dup_tensor_block_ref)
3690		ccfreefree(prep->dup_tensor_block_ref);
3691	_ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3692	ccv_nnc_graph_visit_free(prep->visit);
	/* Everything reachable from prep has been released; drop the container last. */
3693	ccfreefree(prep);
3694}
3695 
/* Flag every prep whose while-count tensor is referenced by a while node.
 *
 * Walks the exec nodes of graph_prep in visit order. For each node carrying
 * CCV_NNC_GRAPH_EXEC_P_WHILE, every p_while input that is a while-count
 * tensor symbol is decoded into a depth d; starting from the node's own
 * sub-prep, the ->p link is followed d times and while_count_tensor is set
 * to 1 on the prep reached (d == 0 flags the sub-prep itself). After that,
 * the function recurses into each sub-graph prep referenced by the node.
 *
 * @param graph_prep The prep whose nodes (and, recursively, sub-preps) are scanned.
 */
3696static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3697{
3698	int i, j;
3699	ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx; {
3700		if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3701		{
			/* Graph refs are stored 1-based; entry 0 is used as the sub-graph of this while node. */
3702			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[0] - 1;
3703			assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3703, __extension__ __PRETTY_FUNCTION__
); }));
3704			ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3705			for (i = 0; i < node->p_while.input_size; i++)
3706				if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3707				{
3708					ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
					/* d is the number of ->p hops to take from sub_prep. */
3709					const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
					/* NOTE(review): prep is never checked for NULL while following ->p;
					 * presumably d never exceeds the nesting depth -- confirm. */
3710					for (j = 0; j < d; j++)
3711						prep = prep->p;
3712					prep->while_count_tensor = 1;
3713				}
3714		}
		/* Recurse into every sub-graph this node references; entries whose 1-based ref is 0 are skipped. */
3715		for (i = 0; i < node->graph_ref_size; i++)
3716		{
3717			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
3718			if (graph_ref >= 0)
3719				_ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3720		}
3721	} ccv_nnc_graph_visit_endfor} }
3722}
3723 
// Resolve a tensor symbol index to a concrete tensor for graph_prep.
//  - symbol >= 0: the entry of graph_prep->tensor_arena->vt_tensors at that index.
//  - symbol == CCV_NNC_NO_TENSOR_SYMBOL: 0 (no tensor).
//  - otherwise the symbol must be a while count symbol (asserted); the tensor
//    is fetched from the tensor metadata of the prep reached by following the
//    decoded number of parent links.
3724static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3725{
3726	if (symbol >= 0)
3727		return graph_prep->tensor_arena->vt_tensors[symbol];
3728	if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3729		return 0;
3730	assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
 : 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
 0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3730, __extension__ __PRETTY_FUNCTION__
); }));
3731	const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3732	int i;
	// d is the number of ->p hops encoded in the symbol; no null check is made on the chain here.
3733	const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3734	for (i = 0; i < d; i++)
3735		prep = prep->p;
	// The target prep must have been flagged as carrying a while count tensor.
3736	assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
 ({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3736, __extension__ __PRETTY_FUNCTION__
); }));
	// Metadata position (0 << 1) + 1 == 1; presumably the slot reserved for the while count tensor — TODO confirm.
3737	return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3738}
3739 
// Run ccv_nnc_graph_topsort on graph and rewrite the exec indices stored in
// graph_exec_arena (source, destination and every graph_execs entry that
// belongs to this graph) through the exec_cvt table filled by the sort.
// NOTE(review): exec_cvt is read as an old-index -> new-index table, which is
// how it is used below; confirm against ccv_nnc_graph_topsort's contract.
3740static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3741{
3742	int i;
	// One int per exec in the concrete graph; the allocation result is not checked here.
3743	int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3744	ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3745	graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3746	graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3747	ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
	// Only entries that point at this graph are remapped; others are left untouched.
3748	for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3749		if (graph_execs[i].graph == graph)
3750			graph_execs[i].d = exec_cvt[graph_execs[i].d];
3751	ccfreefree(exec_cvt);
3752}
3753 
// Build the graph exec arena for symbolic_graph on top of graph_prep->graph.
// Steps, in the order they appear below:
//  1. allocate the arena (struct + exec slots + sub arena pointers in one block);
//  2. create one concrete exec per visited exec symbol (while / case-of nodes
//     recurse into this function for their sub graphs);
//  3. connect the created execs following each symbol's outgoings;
//  4. add SET_FORWARD execs for tensors flagged INIT_ZEROS / INIT_ONES;
//  5. register multiview tensors that no exec wraps yet as "affected";
//  6. pick or create the source and destination execs and set them on the graph.
// sources / destinations are exec symbols of symbolic_graph; tensor_arena
// supplies vt_tensors and the matching sub_arenas. Returns the new arena.
3754static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3755{
3756	int i, j, k;
3757	ccv_nnc_graph_t* const graph = graph_prep->graph;
3758	const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
	// Single allocation. The "- 1" presumably accounts for one graph_execs slot already inside the struct — TODO confirm.
3759	ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3760	graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3761	graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3762	graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
	// The sub arena pointer table lives directly after the last exec slot.
3763	graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3764	memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3765	ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3766	int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
	// Find the largest input / output counts (including p_while inputs) and mark every slot as "not created yet".
3767	for (i = 0; i < exec_symbol_info_size; i++)
3768	{
3769		max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; });
3770		max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; });
3771		if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3772			max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
 : _b; });
3773		graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3774		graph_execs[i].graph = 0;
3775	}
3776	for (i = 0; i < graph_prep->sub_prep_size; i++)
3777		max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
 ((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; });
	// Scratch arrays on the stack (variable length), each at least one element long.
3778	ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })];
3779	ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })];
3780	ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })];
3781	const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3782	const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3783	// Create node, this is in topological order.
3784	ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx; {
3785		if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
3786		{
			// Gather the concrete input / output tensors of this node into the scratch arrays.
3787			for (i = 0; i < node->input_size; i++)
3788				max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3789			for (i = 0; i < node->output_size; i++)
3790				max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3791			if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3792			{
				// While node: wrap the sub graph, build its arena recursively, then install the loop expression and breakpoints.
3793				const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[0] - 1;
3794				assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3794, __extension__ __PRETTY_FUNCTION__
); }));
3795				ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3796				ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3797				graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3798				const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)));
3799				ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3800				ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
				// max_inputs is reused from here on for the while expression's own inputs, resolved against sub_prep.
3801				for (i = 0; i < node->p_while.input_size; i++)
3802					max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3803				for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3804					max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3805				ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3806				_ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3807			} else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
				// Case-of node: outputs that carry an alias_ref are replaced by the tensor they alias.
3808				for (i = 0; i < node->output_size; i++)
3809					if (max_outputs[i] && max_outputs[i]->alias_ref)
3810						max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3811				graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3812				// Check whether this is already covered in the inputs, if not, need to be covered in the update.
3813				for (i = 0; i < node->case_of.argument.offset; i++)
3814				{
3815					ccv_nnc_tensor_t* const update = max_inputs[i];
3816					if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3817						continue;
3818					int flag = 0;
					// NOTE(review): j starts at argument.offset but is bounded by argument.size, while the
					// arguments passed above span max_inputs[offset .. offset + size). If that reading is
					// right, this scan stops early whenever offset > 0 — confirm the intended bound.
3819					for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3820						flag = (update == max_inputs[j]);
3821					if (!flag)
3822						ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3823				}
				// With NO_BYPASS_IO an extra data-transfer sub graph takes branch slot 0, so the real branches shift by one.
3824				const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3825				ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3826				if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3827				{
3828					// Add another graph for data transfer.
3829					ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
					// Reload the un-aliased outputs; the alias substitution above only applied to the case-of exec itself.
3830					for (i = 0; i < node->output_size; i++)
3831						max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3832					ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
 (node->output_size) _b = (node->output_size); (_a <
 _b) ? _a : _b; }), max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
 (node->output_size) _b = (node->output_size); (_a <
 _b) ? _a : _b; }));
3833					ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3834					ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3835					ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
					// Single-node graph: the conversion table is one int and its content is not used afterwards.
3836					int exec_cvt;
3837					ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3838				}
3839				for (i = 0; i < node->graph_ref_size; i++)
3840				{
3841					const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
3842					if (graph_ref < 0)
3843						continue;
3844					ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3845					const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)));
3846					ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3847					ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3848					_ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3849				}
3850			} else {
3851				graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3852			}
3853			ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3854		}
3855	} ccv_nnc_graph_visit_endfor} }
3856	// Then connect them.
3857	ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx; {
3858		if (node->outgoings)
3859			for (i = 0; i < node->outgoings->rnum; i++)
3860			{
3861				const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
				// Only link to targets that were actually created in the pass above.
3862				if (graph_execs[outgoing].graph)
3863					ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3864			}
3865	} ccv_nnc_graph_visit_endfor} }
3866	int source_exec_created = 0;
3867	const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3868	const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3869	ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3870	// After the graph is materialized, we need to handle the case that some of these tensors require to be initialized to zero before use.
3871	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3872	{
3873		if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
)))
3874		{
			// Follow alias_ref (1-based) and then tensor block refs (1-based) to the block that actually holds the data.
3875			int ref = i;
3876			while (tensor_symbol_info[ref].alias_ref)
3877				ref = tensor_symbol_info[ref].alias_ref - 1;
3878			while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
3879				ref = tensor_blocks[ref].ref - 1;
3880			// This is not computable. It could be that we marked a const tensor as init zero.
3881			if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)))
3882				continue;
3883			// If this tensor is not used by any exec, we don't need to init at all. Skip.
3884			if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3885				continue;
3886			ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3887			// Now, we have the original tensor, we can get the actual tensor, and construct the set command.
			// set_exec is assigned by exactly one of the two branches below; the TENSOR_REQUIRE_INIT
			// test above guarantees that at least one of the two flags is set.
3888			ccv_nnc_graph_exec_t set_exec;
3889			if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3890				set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3891			else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3892				set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
			// The set exec must run before every exec listed in the block's head.
3893			for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3894			{
3895				const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)));
				// Indices at or beyond exec_symbol_info_size do not have an exec slot in this arena and are skipped.
3896				if (outgoing >= exec_symbol_info_size)
3897					continue;
3898				assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
 if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3898, __extension__ __PRETTY_FUNCTION__
); }));
3899				assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
 ({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3899, __extension__ __PRETTY_FUNCTION__
); }));
3900				ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3901			}
			// flags records whether the set exec received at least one predecessor from alloc_dep tails.
3902			int flags = 0;
3903			if (alloc_dep[ref])
3904				for (j = 0; j < alloc_dep[ref]->rnum; j++)
3905				{
3906					const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)));
3907					// This is from alloc_dep, it should be computable.
3908					assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3908, __extension__ __PRETTY_FUNCTION__
); }));
3909					if (tensor_blocks[d].tail)
3910						for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3911						{
3912							const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)));
3913							if (incoming >= exec_symbol_info_size)
3914								continue;
3915							assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
 if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3915, __extension__ __PRETTY_FUNCTION__
); }));
3916							assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
 ({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3916, __extension__ __PRETTY_FUNCTION__
); }));
3917							ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3918							flags = 1;
3919						}
3920				}
3921			// If cannot find a start node for this exec, we need to append it to the no-op of the start.
3922			if (!flags)
3923			{
				// The shared no-op source is created lazily, once, on first need.
3924				if (!source_exec_created)
3925				{
3926					graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3927					source_exec_created = 1;
3928				}
3929				ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3930			}
3931		}
3932	}
3933	// Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3934	// (we need that if it is not associated as inputs / outputs of any execs, this is possible if all execs associate
3935	// with its alias).
3936	assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3936, __extension__ __PRETTY_FUNCTION__
); }));
3937	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3938	{
3939		ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3940		// If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3941		if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3942		{
3943			const ccv_array_t* const head = tensor_blocks[i].head;
3944			if (head && head->rnum > 0)
3945				for (j = 0; j < head->rnum; j++)
3946				{
3947					const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(j)));
3948					if (idx >= exec_symbol_info_size)
3949						continue;
3950					assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3950, __extension__ __PRETTY_FUNCTION__); }));
					// NOTE(review): graph_execs[idx].d is used as an index without checking that the exec was created — confirm it cannot still be CCV_NNC_NO_TENSOR_SYMBOL here.
3951					const int d = graph_execs[idx].d;
3952					ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)));
					// flag becomes 1 when one of the exec's tensor wraps already has mv as its first tensor.
3953					int flag = 0;
3954					if (exec_info->tensor_wraps_ref)
3955					{
3956						ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)));
3957						for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3958							flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3959					}
3960					// If none is in the flag, it need to be included in the cast.
3961					if (!flag)
3962						ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3963				}
3964		}
3965	}
3966	// Create source / destination phony node. This is to facilitate use of compiled graph.
3967	// Also, this is needed if you have init zero execs.
	// A no-op source fans out to all sources when one was already created above or when there are several sources;
	// with exactly one source and no init execs, that source's exec is used directly.
3968	if (source_exec_created || source_size > 1)
3969	{
3970		if (!source_exec_created)
3971			graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3972		for (i = 0; i < source_size; i++)
3973			ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3974	} else {
3975		assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
 ({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3975, __extension__ __PRETTY_FUNCTION__
); }));
3976		assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
 if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3976, __extension__ __PRETTY_FUNCTION__
); }));
3977		graph_exec_arena->source = graph_execs[sources[0].d];
3978	}
	// A single destination is used as is; any other count gets a no-op destination that every destination feeds into.
3979	if (destination_size == 1)
3980		graph_exec_arena->destination = graph_execs[destinations[0].d];
3981	else {
3982		graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3983		for (i = 0; i < destination_size; i++)
3984			ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3985	}
3986	ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3987	ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3988	return graph_exec_arena;
3989}
3990 
// Depth-first search through graph_prep and its sub preps for the prep whose
// symbolic_graph is `pair`. Returns that prep's concrete graph, or 0 when no
// prep in this subtree matches.
3991static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3992{
3993	if (graph_prep->symbolic_graph == pair)
3994		return graph_prep->graph;
3995	int i;
	// Empty (null) sub prep slots are skipped; the first match found wins.
3996	for (i = 0; i < graph_prep->sub_prep_size; i++)
3997		if (graph_prep->sub_preps[i])
3998		{
3999			ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
4000			if (graph)
4001				return graph;
4002		}
4003	return 0;
4004}
4005 
// For every non-null direct sub prep of graph_prep whose symbolic graph has a
// `pair`, point the sub prep's concrete graph->pair at the concrete graph that
// corresponds to that symbolic pair, looked up from root_prep.
// NOTE(review): only the direct children of graph_prep are processed here;
// this function does not recurse into deeper levels — confirm callers cover them.
4006static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4007{
4008	int i;
4009	for (i = 0; i < graph_prep->sub_prep_size; i++)
4010		if (graph_prep->sub_preps[i])
4011		{
			// The lookup result (possibly 0 when nothing matches) is stored as is.
4012			if (graph_prep->sub_preps[i]->symbolic_graph->pair)
4013				graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
4014		}
4015}
4016 
// Pair up concrete execs according to the pair_ref recorded on their exec
// symbols, for graph_prep / graph_exec_arena and, recursively, for all of
// their sub preps / sub arenas. root_arena is the arena in which the pair
// symbol is looked up. The arena must belong to graph_prep's symbolic graph
// (asserted through graph_ref).
4017static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4018{
4019	assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4019, __extension__ __PRETTY_FUNCTION__
); }));
4020	int i;
4021	for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
4022	{
		// Symbols flagged dead are ignored.
4023		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
))
4024			continue;
		// Only execs that were materialized and whose symbol carries a (1-based) pair_ref are considered.
4025		if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4026		{
			// The pair symbol lives in the symbolic graph's pair when there is one, otherwise in the same symbolic graph.
4027			ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4028				.d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4029				.graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4030			});
			// A negative d means the lookup produced no usable exec; nothing is paired in that case.
4031			if (pair_exec.d >= 0)
4032				ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4033		}
4034	}
	// Recurse with the sub arena at the same index as the sub prep.
4035	for (i = 0; i < graph_prep->sub_prep_size; i++)
4036		if (graph_prep->sub_preps[i])
4037			_ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4038}
4039 
4040static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4041{
4042	int i;
4043	if (graph_prep->dup_breakpoints)
4044	{
4045		// Strip the const modifier only possible because it is a sub-graph.
4046		ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4047		for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4048			ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
 + (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i))));
4049		ccv_array_free(graph_prep->dup_breakpoints);
4050		graph_prep->dup_breakpoints = 0;
4051		graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4052		// Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
4053		memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4054		// Since exec_symbol_info changed, create a new visit object.
4055		assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); }));
4056		assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
 ({ if (symbolic_graph->destinations) ; else __assert_fail
 ("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4056, __extension__ __PRETTY_FUNCTION__); }));
4057		ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)));
4058		const int source_size = symbolic_graph->sources->rnum;
4059		ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)));
4060		const int destination_size = symbolic_graph->destinations->rnum;
4061		ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
 c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
 = ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
 > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
 _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
 = (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4061, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
 } _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
 3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
 __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4061, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
 _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4061, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
 1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
 _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
 _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
 ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
 ((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
 : 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
4062		ccv_nnc_graph_visit_free(graph_prep->visit);
4063		graph_prep->visit = visit;
4064		assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4064, __extension__ __PRETTY_FUNCTION__
); }));
4065		ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4066	}
4067	ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx; {
4068		for (i = 0; i < node->graph_ref_size; i++)
4069		{
4070			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
4071			if (graph_ref >= 0)
4072				_ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4073		}
4074	} ccv_nnc_graph_visit_endfor} }
4075}
4076 
4077const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4078 
4079void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4080{
4081	assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4081, __extension__ __PRETTY_FUNCTION__); }));
4082	assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
 if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4082, __extension__ __PRETTY_FUNCTION__
); }));
4083	assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
 ({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4083, __extension__ __PRETTY_FUNCTION__
); }));
4084	int i;
4085	// Cannot bind the multi-view.
4086	for (i = 0; i < tensor_bind_size; i++)
4087	{
4088		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4088, __extension__ __PRETTY_FUNCTION__
); }));
4089		assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4089, __extension__ __PRETTY_FUNCTION__
); }));
4090	}
4091	ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4092	_ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4093	ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4094	_ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4095	*tensor_arena_ref = tensor_arena;
4096	// The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
4097	_ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4098	// Now tensor allocation is done, if there are any dup_breakpoints, I need to clean it up.
4099	_ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4100	*graph_ref = graph_prep->graph;
4101	ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4102	_ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4103	_ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4104	*graph_exec_arena_ref = graph_exec_arena;
4105	_ccv_nnc_symbolic_graph_prep_free(graph_prep);
4106}
4107 
4108static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4109{
4110	// Buffers are inherited from above, no need to dealloc.
4111	int i;
4112	for (i = 0; i < tensor_arena->sub_arena_size; i++)
4113		if (tensor_arena->sub_arenas[i])
4114			_ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4115	for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4116	{
4117		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
 (size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i))));
4118		assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4118, __extension__ __PRETTY_FUNCTION__
); }));
4119		ccv_nnc_tensor_multiview_free(*mv);
4120	}
4121	ccv_array_free(tensor_arena->tensor_metadata);
4122	ccv_array_free(tensor_arena->m_tensor_idx);
4123	if (tensor_arena->pb_vt_tensors)
4124		ccfreefree(tensor_arena->pb_vt_tensors);
4125	if (tensor_arena->vt_alias_r_refs_p)
4126		ccfreefree(tensor_arena->vt_alias_r_refs_p);
4127	if (tensor_arena->vt_sizes)
4128		ccfreefree(tensor_arena->vt_sizes);
4129	ccfreefree(tensor_arena);
4130}
4131 
4132void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4133{
4134	assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
 == (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4134, __extension__ __PRETTY_FUNCTION__
); }));
1
Assuming field 'graph_ref' is equal to field 'graph'→
2
←
Taking true branch→
4135	assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4135, __extension__ __PRETTY_FUNCTION__
); }));
3
←
Assuming field 'd' is < field 'vt_tensor_size'→
4
←
Taking true branch→
4136	assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
 if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4136, __extension__ __PRETTY_FUNCTION__
); }));
5
←
Assuming field 'd' is >= 0→
6
←
Taking true branch→
4137	// Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4138	int i;
4139	if (!tensor_arena->pb_vt_tensors)
7
←
Assuming field 'pb_vt_tensors' is null→
8
←
Taking true branch→
4140	{
4141		tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4142		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
9
←
Assuming 'i' is < field 'vt_tensor_size'→
10
←
Loop condition is true.  Entering loop body→
13
←
Assuming 'i' is >= field 'vt_tensor_size'→
14
←
Loop condition is false. Execution continues on line 4146→
4143			if (tensor_arena->vt_tensors[i])
11
←
Assuming pointer value is null→
12
←
Taking false branch→
4144				tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4145	}
4146	if (!tensor_arena->vt_alias_r_refs_p)
15
←
Assuming field 'vt_alias_r_refs_p' is non-null→
4147	{
4148		tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4149		tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4150		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4151			if (tensor_arena->vt_alias_refs[i])
4152			{
4153				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4154				assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
 >= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4154, __extension__ __PRETTY_FUNCTION__
); }));
4155				++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4156			}
4157		int refp = 0;
4158		for (i = 0; i < tensor_arena->vt_tensor_size; i++) // Allocate each with aliases position on vt_alias_r_refs. It points to the end.
4159			if (tensor_arena->vt_alias_r_refs_p[i])
4160				refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4161			else
4162				tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4163		for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4164			tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4165		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4166			if (tensor_arena->vt_alias_refs[i])
4167			{
4168				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4169				assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
 >= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4169, __extension__ __PRETTY_FUNCTION__
); }));
4170				const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4171				assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4171, __extension__ __PRETTY_FUNCTION__); }));
4172				tensor_arena->vt_alias_r_refs[pos] = i;
4173			}
4174	}
4175	const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
16
←
Taking false branch→
17
←
Assuming the condition is true→
18
←
'?' condition is true→
4176	if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
19
←
Assuming the condition is false→
20
←
Taking false branch→
4177	{
4178		assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4178, __extension__ __PRETTY_FUNCTION__
); })); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4179		assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4181, __extension__ __PRETTY_FUNCTION__
); }))
4180					ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4181, __extension__ __PRETTY_FUNCTION__
); }))
4181				(size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4181, __extension__ __PRETTY_FUNCTION__
); }));
4182	} else
4183		{ assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof ((ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if (ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4183, __extension__ __PRETTY_FUNCTION__
); })); }
21
←
Assuming the condition is true→
22
←
Taking true branch→
4184	if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d])((*(int*)(tensor_arena->vt_tensors[symbol.d])) & CCV_TENSOR_VIEW
))
23
←
Dereference of null pointer
4185		{ assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors
[symbol.d])->off == 0) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t
*)tensor_arena->vt_tensors[symbol.d])->off == 0) ; else
 __assert_fail ("((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4185, __extension__ __PRETTY_FUNCTION__
); })); }
4186	tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4187	if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4188		for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4189		{
4190			const int d = tensor_arena->vt_alias_r_refs[i];
4191			if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4192				break;
4193			ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4194			d_tensor->info.datatype = tensor->info.datatype;
4195			d_tensor->info.reserved = tensor->info.reserved;
4196			if (CCV_IS_TENSOR_VIEW(d_tensor)((*(int*)(d_tensor)) & CCV_TENSOR_VIEW))
4197				ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4198			else {
4199				d_tensor->data.u8 = tensor->data.u8;
4200				d_tensor->dataof = tensor->dataof;
4201			}
4202		}
4203}
4204 
4205void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4206{
4207	if (!tensor_arena->pb_vt_tensors)
4208		return;
4209	int i;
4210	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4211		if (tensor_arena->vt_tensors[i])
4212			tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4213}
4214 
4215uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4216{
4217	uint64_t total_size = 0;
4218	int i;
4219	for (i = 0; i < tensor_arena->buffer_size; i++)
4220		total_size += tensor_arena->buffers[i].size;
4221	return total_size;
4222}
4223 
4224static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4225{
4226	int i;
4227	if (mv->it)
4228		mv->it->info = params;
4229	for (i = 0; i < mv->repeat + mv->kind; i++)
4230	{
4231		ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i];
4232		if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4233			_ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4234		else
4235			tensor->info = params;
4236	}
4237}
4238 
4239int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4240{
4241	int i;
4242	assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size)((void) sizeof ((graph->tensor_symbol_info->rnum >= tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (graph->
tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size
) ; else __assert_fail ("graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4242, __extension__ __PRETTY_FUNCTION__
); }));
4243	if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
4244	{
4245		tensor_arena->vt_sizes = (size_t*)ccmallocmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4246		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4247			if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4248			{
4249				ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4250				if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4251				{
4252					ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4253					while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
4254						mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
4255					tensor = (ccv_nnc_tensor_t*)mv;
4256				}
4257				tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4258			}
4259	}
4260	int flag = 0;
4261	for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4262		if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4263		{
4264			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
 (size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)));
4265			ccv_nnc_tensor_param_t params = symbol_info->info;
4266			params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4267			params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4268			flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4269		}
4270	if (flag)
4271		return -1;
4272	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4273		if (tensor_arena->vt_tensors[i])
4274		{
4275			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
 (size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)));
4276			ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4277			if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4278			{
4279				assert(!tensor_arena->vt_alias_refs[i])((void) sizeof ((!tensor_arena->vt_alias_refs[i]) ? 1 : 0)
, __extension__ ({ if (!tensor_arena->vt_alias_refs[i]) ; else
 __assert_fail ("!tensor_arena->vt_alias_refs[i]", "ccv_nnc_symbolic_graph_compile.c"
, 4279, __extension__ __PRETTY_FUNCTION__); }));
4280				_ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4281			} else if (!tensor_arena->vt_alias_refs[i]) {
4282				ccv_nnc_tensor_param_t params = symbol_info->info;
4283				params.datatype = tensor->info.datatype;
4284				params.reserved = tensor->info.reserved;
4285				tensor->info = params;
4286			} else {
4287				off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4288				ccv_nnc_tensor_param_t params = symbol_info->info;
4289				params.datatype = tensor->info.datatype;
4290				params.reserved = tensor->info.reserved;
4291				tensor->info = params;
4292				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4293				ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4294				if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4295				{
4296					((ccv_nnc_tensor_view_t*)tensor)->off = off;
4297					memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4298				}
4299			}
4300		}
4301	// Should handle sub_tensor_arena, don't do that at the moment.
4302	assert(!graph->sub_graphs)((void) sizeof ((!graph->sub_graphs) ? 1 : 0), __extension__
 ({ if (!graph->sub_graphs) ; else __assert_fail ("!graph->sub_graphs"
, "ccv_nnc_symbolic_graph_compile.c", 4302, __extension__ __PRETTY_FUNCTION__
); }));
4303	return 0;
4304}
4305 
// Refreshes the command stored on each concrete graph node from the matching entry of the symbolic graph.
// For every slot i of graph_exec_arena->graph_execs, the command found at index i of
// symbolic_graph->exec_symbol_info is written onto that node of `graph` through ccv_nnc_graph_exec_set.
// When the command id is unchanged, the node keeps the backend and algorithm it already has.
// The leading assert requires the symbolic graph to hold at least graph_exec_size exec symbols.
// NOTE(review): in this listing every macro use (assert, ccv_array_get) is followed by its expansion text;
// that residue appears to come from the analyzer report, not from the original source -- confirm before copying code out.
4306void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4307{
4308	assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size)((void) sizeof ((symbolic_graph->exec_symbol_info->rnum
 >= graph_exec_arena->graph_exec_size) ? 1 : 0), __extension__
 ({ if (symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena
->graph_exec_size) ; else __assert_fail ("symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 4308, __extension__ __PRETTY_FUNCTION__
); }));
4309	int i;
4310	for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4311	{
4312		const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4313		if (graph_exec.d < 0) // presumably an exec symbol that has no concrete node -- TODO confirm what a negative d encodes
4314			continue;
4315		const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4316		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(i)));
4317		ccv_nnc_cmd_t new_cmd = symbol_info->cmd; // local mutable copy; the symbol info is only read
4318		if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, replacing the backend and algorithm to the existing one, which hypothetically has been autotuned..
4319		{
4320			new_cmd.backend = existing_cmd.backend;
4321			new_cmd.algorithm = existing_cmd.algorithm;
4322		}
4323		ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4324	}
4325}
4326 
// Releases every still-allocated buffer of the tensor arena, then runs and drops its disposers.
// Each buffer with a non-NULL ptr is freed with the routine that matches its memory kind and the build configuration:
//   - CUDA build: GPU buffers go to the arena allocator's free hook when one is set, otherwise to cufree;
//     CPU buffers go to cuhostfree when pin_mem is set, otherwise to ccfree.
//   - MPS build: GPU buffers go to mpheapfree, CPU buffers to ccfree.
//   - otherwise: only CPU buffers are accepted (asserted) and they go to ccfree.
// The ptr is reset to 0 afterwards, so calling this function again skips buffers that were already released.
// The arena structure itself is not freed here.
// NOTE(review): in this listing macro names are followed by their expansion text (e.g. `ccfreefree`, `HAVE_CUDA1`,
// the text after each assert); that residue appears to come from the analyzer report -- confirm against the real source.
4327void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4328{
4329	int i;
4330	for (i = 0; i < tensor_arena->buffer_size; i++)
4331	{
4332		if (!tensor_arena->buffers[i].ptr) // nothing to release for this slot
4333			continue;
4334		const int buffer_type = tensor_arena->buffers[i].type;; // NOTE(review): the doubled ';' is a harmless empty statement
4335		const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3); // per the expansion shown: the low two bits of the type
4336#ifdef HAVE_CUDA1
4337		const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8); // per the expansion shown: bits 8..19 of the type
4338		if (memory_type == CCV_TENSOR_GPU_MEMORY)
4339		{
4340			if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free) // a custom allocator with a free hook takes precedence over cufree
4341				tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4342			else
4343				cufree(device_id, tensor_arena->buffers[i].ptr);
4344		} else {
4345			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4345, __extension__ __PRETTY_FUNCTION__
); }));
4346			if (tensor_arena->buffers[i].pin_mem) // presumably pinned host memory that needs the matching host free -- confirm against cuhostfree
4347				cuhostfree(tensor_arena->buffers[i].ptr);
4348			else
4349				ccfreefree(tensor_arena->buffers[i].ptr);
4350		}
4351#elif defined(HAVE_MPS)
4352		const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4353		if (memory_type == CCV_TENSOR_GPU_MEMORY)
4354		{
4355			// if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4356			// 	tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4357			// else
4358			mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4359		} else {
4360			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4360, __extension__ __PRETTY_FUNCTION__
); }));
4361			ccfreefree(tensor_arena->buffers[i].ptr);
4362		}
4363#else
4364		assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4364, __extension__ __PRETTY_FUNCTION__
); }));
4365		ccfreefree(tensor_arena->buffers[i].ptr);
4366#endif
4367		tensor_arena->buffers[i].ptr = 0; // marks the slot as released; the check at the top of the loop then skips it
4368	}
4369	// For now, the life-cycle of the disposers lives with the buffer. It may ends before the tensor arena deallocates.
4370	if (tensor_arena->disposers)
4371	{
4372		for (i = 0; i < tensor_arena->disposers->rnum; i++)
4373		{
4374			ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i)((void*)(((char*)((tensor_arena->disposers)->data)) + (
size_t)(tensor_arena->disposers)->rsize * (size_t)(i)));
4375			disposer->dispose(disposer->ptr, disposer->userdata);
4376		}
4377		ccv_array_free(tensor_arena->disposers);
4378		tensor_arena->disposers = 0; // cleared so a later call does not run the disposers a second time
4379	}
4380}
4381 
// Tears down a tensor arena in two steps: first ccv_nnc_tensor_arena_buffer_free releases the buffers and
// runs the disposers, then _ccv_nnc_tensor_arena_free is called on the arena.
// NOTE(review): _ccv_nnc_tensor_arena_free is defined outside this view; presumably it frees the remaining
// bookkeeping and the arena structure itself, so the pointer should not be used afterwards -- confirm.
4382void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4383{
4384	ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4385	_ccv_nnc_tensor_arena_free(tensor_arena);
4386}
4387 
// Frees a graph exec arena together with all of its nested sub arenas.
// Every non-NULL entry of sub_arenas is freed recursively (depth-first) before the arena itself is released.
// NOTE(review): the sub_arenas and graph_execs arrays are not freed separately here; presumably they live inside
// the same allocation as the arena -- confirm against the allocation site.
// NOTE(review): `ccfreefree` looks like the macro name `ccfree` followed by its expansion `free` in this listing.
4388void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4389{
4390	int i;
4391	for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4392		if (graph_exec_arena->sub_arenas[i]) // empty slots are allowed and skipped
4393			ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4394	ccfreefree(graph_exec_arena);
4395}