ccv_nnc_symbolic_graph

Bug Summary

File:	nnc/ccv_nnc_symbolic_graph_compile.c
Warning:	line 3794, column 7 The left operand of '==' is a garbage value
Annotated Source Code

Press '?' to see keyboard shortcuts
Show analyzer invocation
clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-05-17-061038-2658378-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA1
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12 
13// MARK - Level-3 API
14 
// Bookkeeping for one tensor block considered by the tensor arena allocator.
15typedef struct {
16	int flags; // Bit flags: the low two bits (0x3) hold the UNASSIGNED / ALIAS state, 0xc holds the read / write mode, plus the ANONYMOUS and UNFOLDABLE_* bits (see the TENSOR_* macros).
17	int type; // Type of the block. Buffers are only reused between blocks of the same type.
18	int pin_mem; // This memory needs to be pinned.
19	int ref; // Reference to another tensor block. Starts with 1.
20	int alias_ref; // If reference to another tensor, and the other one is an alias. Starts with 1.
21	int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Starts with 1.
22	int companion_ref; // Reference to another block with which this one shares the same memory region. Starts with 1. The current crude implementation requires the two to be mutual companions. Of the two, the block whose own index i is below the companion's index (i < companion_ref - 1, see IS_PRIMARY_COMPANION) is the primary and the other one is the secondary. The allocation algorithm uses the primary throughout.
23	int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Starts with 1.
24	ccv_array_t* r_refs; // If this is referenced by other blocks, the array points back to these blocks. Starts with 1.
25	uint64_t size; // The size of the tensor expected.
26	int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Starts with 1.
27	ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Starts with 0.
28	ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29	ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31 
// True when block idx is the primary of a companion pair, that is, its own 0-based index is below
// the companion's 0-based index (companion_ref - 1). When companion_ref is 0 the subtraction wraps to
// the largest uint32_t after the cast, so the comparison also holds for any smaller idx.
32#define IS_PRIMARY_COMPANION(idx, block)((idx) < (uint32_t)((block).companion_ref - 1)) ((idx) < (uint32_t)((block).companion_ref - 1))
33 
// Bit values stored in ccv_nnc_tensor_block_t.flags and tested / set through the TENSOR_* macros.
34enum {
35	UNASSIGNED = 0x1, // State in the low two bits (0x3): tested by TENSOR_EXPECT_UNASSIGNED.
36	ALIAS = 0x2, // State in the low two bits (0x3): tested by TENSOR_EXPECT_ALIAS.
37	READ_ONLY = 0x4, // Part of the 0xc read / write field read by TENSOR_READ_WRITE.
38	WRITE_ONLY = 0x8, // Part of the 0xc read / write field read by TENSOR_READ_WRITE.
39	READ_WRITE = 0xc, // Both READ_ONLY and WRITE_ONLY bits set.
40	ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
41	UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42	UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43};
44 
// Accessors for the flags field of a tensor block `t` (t must be an lvalue with a `flags` member).
// - TENSOR_EXPECT_*: test the two low bits (0x3): 0 is ordinary, UNASSIGNED or ALIAS otherwise;
//   COMPUTABLE means neither ALIAS nor UNASSIGNED.
// - TENSOR_EXPECT_SET_* / UNSET_*: rewrite those low bits in place.
// - TENSOR_READ_WRITE / TENSOR_SET_READ_WRITE: read or replace the 0xc read / write field.
// - TENSOR_SET_* / TENSOR_IS_* for ANONYMOUS and UNFOLDABLE_AS_INPUT / OUTPUT: set or test that single bit.
45#define TENSOR_EXPECT_ORDINARY(t)((t.flags & 0x3) == 0) ((t.flags & 0x3) == 0)
46#define TENSOR_EXPECT_SET_ORDINARY(t)(t.flags = (t.flags & ~0x3)) (t.flags = (t.flags & ~0x3))
47#define TENSOR_EXPECT_UNASSIGNED(t)((t.flags & 0x3) == UNASSIGNED) ((t.flags & 0x3) == UNASSIGNED)
48#define TENSOR_EXPECT_SET_UNASSIGNED(t)(t.flags = ((t.flags & ~0x3) | UNASSIGNED)) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
49#define TENSOR_EXPECT_UNSET_UNASSIGNED(t)(t.flags = (t.flags & ~0x1)) (t.flags = (t.flags & ~0x1))
50#define TENSOR_EXPECT_ALIAS(t)((t.flags & 0x3) == ALIAS) ((t.flags & 0x3) == ALIAS)
51#define TENSOR_EXPECT_COMPUTABLE(t)(!((t.flags & 0x3) == ALIAS) && !((t.flags & 0x3
) == UNASSIGNED)) (!TENSOR_EXPECT_ALIAS(t)((t.flags & 0x3) == ALIAS) && !TENSOR_EXPECT_UNASSIGNED(t)((t.flags & 0x3) == UNASSIGNED))
52#define TENSOR_READ_WRITE(t)(t.flags & 0xc) (t.flags & 0xc)
53#define TENSOR_SET_READ_WRITE(t, rw)(t.flags = ((t.flags & ~0xc) | rw)) (t.flags = ((t.flags & ~0xc) | rw))
54#define TENSOR_SET_ANONYMOUS(t)(t.flags = ((t.flags & ~0x10) | ANONYMOUS)) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55#define TENSOR_IS_ANONYMOUS(t)(t.flags & ANONYMOUS) (t.flags & ANONYMOUS)
56#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t)(t.flags = (t.flags | UNFOLDABLE_AS_INPUT)) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t)(t.flags & UNFOLDABLE_AS_INPUT) (t.flags & UNFOLDABLE_AS_INPUT)
58#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t)(t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT)) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t)(t.flags & UNFOLDABLE_AS_OUTPUT) (t.flags & UNFOLDABLE_AS_OUTPUT)
60 
// Non-zero when `flags` carries CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS or CCV_NNC_TENSOR_SYMBOL_INIT_ONES.
61#define TENSOR_REQUIRE_INIT(flags)(((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags)
 & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62 
63// Holds additional information about the exec nodes.
64typedef struct {
65	int flags; // NOTE(review): presumably a bitmask of the CCV_NNC_GRAPH_EXEC_ATTR_* values below; confirm against usage.
66} ccv_nnc_graph_exec_flag_t;
67 
68enum {
69	CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for a case..of statement.
70};
71 
// One candidate tensor block for the next allocation round.
72typedef struct {
73	int index; // Index into tensor_blocks.
74	int oc; // Overlap count (adj[index].oc, plus the companion's count when there is a companion).
75	int type; // Copied from tensor_blocks[index].type.
76	uint64_t size; // Size of the block; the larger of the block's and its companion's size when there is a companion.
77} ccv_nnc_tensor_opt_t;
78 
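79// Entries of the same type are grouped together first (because different types won't be reused at all); the caller only collects entries of one type at a time.
80// The comparator then sorts by size in descending order and, for equal sizes, by oc in descending order.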
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)void _ccv_nnc_tensor_opt_sort_by_size_and_oc(ccv_nnc_tensor_opt_t
 *array, size_t total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_opt_t
 t; int sp = 0; struct { ccv_nnc_tensor_opt_t *lb; ccv_nnc_tensor_opt_t
 *ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_opt_t
* left = stack[sp].lb; ccv_nnc_tensor_opt_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_opt_t
* ptr; ccv_nnc_tensor_opt_t* ptr2; if( n <= isort_thresh )
 { insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
 { for( ptr2 = ptr; ptr2 > left && more_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
 ((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_opt_t
* left0; ccv_nnc_tensor_opt_t* left1; ccv_nnc_tensor_opt_t* right0
; ccv_nnc_tensor_opt_t* right1; ccv_nnc_tensor_opt_t* pivot; ccv_nnc_tensor_opt_t
* a; ccv_nnc_tensor_opt_t* b; ccv_nnc_tensor_opt_t* c; int swap_cnt
 = 0; left0 = left; right0 = right; pivot = left + (n/2); if(
 n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
 + 2*d; left = more_than(*a, *b, aux) ? (more_than(*b, *c, aux
) ? b : (more_than(*a, *c, aux) ? c : a)) : (more_than(*c, *b
, aux) ? b : (more_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = more_than(*a, *b, aux) ? (
more_than(*b, *c, aux) ? b : (more_than(*a, *c, aux) ? c : a)
) : (more_than(*c, *b, aux) ? b : (more_than(*a, *c, aux) ? a
 : c)); a = right - 2*d, b = right - d, c = right; right = more_than
(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than(*a, *
c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
 = more_than(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than
(*a, *c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !more_than(*pivot, *left
, aux) ) { if( !more_than(*left, *pivot, aux) ) { if( left >
 left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
 right && !more_than(*right, *pivot, aux) ) { if( !more_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
 = 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
 swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
 left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
 _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
 ((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
 ((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
 _b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
 1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
 stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
 } else left = left0, right = left0 + n - 1; } else if( m >
 1 ) left = right0 - m + 1, right = right0; else break; } } }
 }
83#undef more_than
// A (block, hop) pair collected while scanning a row of the hop matrix.
84typedef struct {
85	int idx; // Block index plus one (for_block stores y + 1).
86	int hop; // Hop value read from the matrix cell; the sort key used by _ccv_nnc_sort_by_hops.
87} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)void _ccv_nnc_sort_by_hops(ccv_nnc_tensor_hop_t *array, size_t
 total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_hop_t
 t; int sp = 0; struct { ccv_nnc_tensor_hop_t *lb; ccv_nnc_tensor_hop_t
 *ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_hop_t
* left = stack[sp].lb; ccv_nnc_tensor_hop_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_hop_t
* ptr; ccv_nnc_tensor_hop_t* ptr2; if( n <= isort_thresh )
 { insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
 { for( ptr2 = ptr; ptr2 > left && less_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
 ((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_hop_t
* left0; ccv_nnc_tensor_hop_t* left1; ccv_nnc_tensor_hop_t* right0
; ccv_nnc_tensor_hop_t* right1; ccv_nnc_tensor_hop_t* pivot; ccv_nnc_tensor_hop_t
* a; ccv_nnc_tensor_hop_t* b; ccv_nnc_tensor_hop_t* c; int swap_cnt
 = 0; left0 = left; right0 = right; pivot = left + (n/2); if(
 n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
 + 2*d; left = less_than(*a, *b, aux) ? (less_than(*b, *c, aux
) ? b : (less_than(*a, *c, aux) ? c : a)) : (less_than(*c, *b
, aux) ? b : (less_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = less_than(*a, *b, aux) ? (
less_than(*b, *c, aux) ? b : (less_than(*a, *c, aux) ? c : a)
) : (less_than(*c, *b, aux) ? b : (less_than(*a, *c, aux) ? a
 : c)); a = right - 2*d, b = right - d, c = right; right = less_than
(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than(*a, *
c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
 = less_than(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than
(*a, *c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !less_than(*pivot, *left
, aux) ) { if( !less_than(*left, *pivot, aux) ) { if( left >
 left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
 right && !less_than(*right, *pivot, aux) ) { if( !less_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
 = 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
 swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
 left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
 _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
 ((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
 ((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
 _b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
 1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
 stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
 } else left = left0, right = left0 + n - 1; } else if( m >
 1 ) left = right0 - m + 1, right = right0; else break; } } }
 }
90#undef less_than
91 
92// If b has items overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95	assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
 ("a", "ccv_nnc_symbolic_graph_compile.c", 95, __extension__ __PRETTY_FUNCTION__
); }));
96	assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
 ("b", "ccv_nnc_symbolic_graph_compile.c", 96, __extension__ __PRETTY_FUNCTION__
); }));
97	int x, y;
98	for (x = 0; x < b->rnum; x++)
99	{
100		const int p = *(int*)ccv_array_get(b, x)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(x)));
101		int flag = 0;
102		// In extreme cases where a is a superset of b, then a is still after b, we are good.
103		for (y = 0; !flag && y < a->rnum; y++)
104		{
105			const int q = *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y)));
106			flag = (p == q);
107		}
108		if (!flag)
109			for (y = 0; y < a->rnum; y++)
110			{
111				ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y))), p);
112				if (!cell.i32 || cell.i32[0] == 0)
113					return 0;
114			}
115	}
116	// If b->rnum == 0, a is after b for sure.
117	// Otherwise, if a->rnum == 0, we don't check any, buf if b->rnum > 0, then we cannot say a is after b.
118	// if both a->rnum > 0 and b->rnum > 0, above logic should checked all.
119	return (a->rnum > 0 || b->rnum == 0);
120}
121 
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124	assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
 ("a", "ccv_nnc_symbolic_graph_compile.c", 124, __extension__
 __PRETTY_FUNCTION__); }));
125	assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
 ("b", "ccv_nnc_symbolic_graph_compile.c", 125, __extension__
 __PRETTY_FUNCTION__); }));
126	if (!a->rnum || !b->rnum)
127		return 0;
128	int x, y, max_hop = 0;
129	for (x = 0; x < a->rnum; x++)
130	{
131		ccv_sparse_matrix_vector_t* const vector = ccv_get_sparse_matrix_vector(exec_dep, *(int*)ccv_array_get(a, x)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(x))));
132		if (!vector)
133			return 0;
134		for (y = 0; y < b->rnum; y++)
135		{
136			const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, *(int*)ccv_array_get(b, y)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(y))));
137			if (!cell.i32 || cell.i32[0] == 0)
138				return 0;
139			if (cell.i32[0] > max_hop)
140				max_hop = cell.i32[0];
141		}
142	}
143	// We've entered this nested-for loop, therefore, it must be verifiably, deterministically after b now.
144	// The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
145	return max_hop;
146}
147 
148// If every a's head is deterministically after b's tail
149static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
150{
151	return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
152}
153 
// Result of allocation planning: which buffers exist and where each block sits inside them.
154typedef struct {
155	ccv_array_t** alloc_dep; // NOTE(review): presumably one dependency array per tensor block; confirm against where it is filled.
156	int vt_block_size; // NOTE(review): presumably the number of entries in vt_blocks; confirm.
157	int buffer_size; // NOTE(review): presumably the number of entries in buffers; confirm.
158	int block_size; // NOTE(review): presumably the number of entries in blocks; confirm.
159	int* vt_blocks; // A reference to the block, because blocks only contains available blocks (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
160	struct {
161		int type; // The type from tensor blocks.
162		int pin_mem; // Whether this is pinned memory.
163		int flags; // The flags (currently for READ_ONLY or not).
164		uint64_t size; // The size of the buffer allocated.
165		int p_refs[2]; // Reference to the upper level block. Starts at 1. Only index 0 is valid throughout; the second slot is used in the code as a temporary placeholder.
166		ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that point to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Starts with 0.
167	}* buffers;
168	struct {
169		int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
170		int block_ref; // A reference to which block in the given tensor_block to use.
171		uint64_t offset; // The offset of this block.
172	}* blocks;
173} ccv_nnc_tensor_alloc_prep_t;
174 
// Preparation state for one symbolic graph; linked to its parent prep (p) and to the preps of its sub-graphs (sub_preps).
175typedef struct ccv_nnc_symbolic_graph_prep_s {
176	int flags;
177	int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
178	int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
179	int exec_idx;
180	int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
181	int tensor_symbol_info_size;
182	int exec_symbol_info_size;
183	int tensor_block_size;
184	int sub_prep_size;
185	ccv_nnc_tensor_block_t* tensor_blocks;
186	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
187	ccv_nnc_graph_exec_flag_t* exec_flags;
188	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
189	int* dup_tensor_block_ref;
190	ccv_nnc_graph_visit_t* visit;
191	ccv_nnc_tensor_alloc_prep_t* alloc_prep;
192	struct ccv_nnc_symbolic_graph_prep_s* p; // NOTE(review): presumably the parent graph's prep (see p_idx); confirm.
193	struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
194	// Structures that don't require to be freed after deallocation.
195	const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
196	ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
197	ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
198	ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
199} ccv_nnc_symbolic_graph_prep_t;
200 
// Interference record for one tensor block.
201typedef struct {
202	int oc; // Overlap count: incremented once for every other block found to interfere with this one.
203	ccv_array_t* itf; // Indices (int) of the interfering blocks; created on the first interference found.
204} ccv_nnc_tensor_block_adjacent_t;
205 
206static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
207{
208	// Compute how many dis-continuous buffers are needed.
209	// We prefer to have several dis-continuous buffers instead of one big buffer because
210	// in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
211	// to fully utilize memory.
212	int i, j, k;
213	ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloccalloc(tensor_block_size, sizeof(ccv_array_t*));
214	int allocable_tensor_size = 0, available_tensor_size = 0;
215	for (i = 0; i < tensor_block_size; i++)
216		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
217		{
218			// Tensors that we need the header info.
219			++available_tensor_size;
220			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
221				// Tensors that we actually need to allocate (exclude the alias).
222				++allocable_tensor_size;
223		}
224	ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
225	ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
226	ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
227	// Overlap count.
228	for (i = 0; i < tensor_block_size; i++)
229		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
230			for (j = i + 1; j < tensor_block_size; j++)
231				if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j])(!((tensor_blocks[j].flags & 0x3) == ALIAS) && !(
(tensor_blocks[j].flags & 0x3) == UNASSIGNED)))
232				{
233					// We only reuse buffers within the same memory type. The tensor_dt / tensor_df
234					// matrices are only queried later for same-type candidates in this function,
235					// thus cross-type hop relations are not needed for allocation planning here.
236					if (tensor_blocks[i].type != tensor_blocks[j].type)
237						continue;
238					// Check to see if they interfere (default to yes).
239					// If any of the i's head is deterministically later than j's tail
240					// or any of the i's tail is deterministically earlier than j's head, they don't interfere.
241					const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
242					int j_hop_i = 0;
243					if (i_hop_j > 0)
244					{
245						ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
246						ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
247					} else {
248						// It cannot be that both directions are positive. If i can hop to j, we don't
249						// need the reverse hop value for any subsequent allocation decision.
250						j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
251						if (j_hop_i > 0)
252						{
253							ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
254							ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
255						}
256					}
257					if (!i_hop_j && !j_hop_i)
258					{
259						if (!adj[i].itf)
260							adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
261						ccv_array_push(adj[i].itf, &j);
262						++adj[i].oc;
263						if (!adj[j].itf)
264							adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
265						ccv_array_push(adj[j].itf, &i);
266						++adj[j].oc;
267					}
268				}
269	const int exec_dep_rows = exec_dep->rows;
270	ccv_matrix_free(exec_dep);
271	ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
272	int* const assigned = (int*)cccalloccalloc(tensor_block_size, sizeof(int));
273	uint64_t* const allocated_offset = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
274	uint64_t* const allocated_size = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
275	uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
276	int num_assigned = 0; 
277	// I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
278	// Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
279	// The first channel denotes the bytes available for allocation,
280	// the second channel denotes the offset available for the allocation,
281	ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
282	ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
283	for (j = 0; j < allocable_tensor_size;)
284	{
285		// Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
286		uint64_t max_size = 0;
287		ccv_array_clear(opt);
288		int current_type = 0; // Deal with one type at a time.
289		for (i = 0; i < tensor_block_size; i++)
290			if (tensor_blocks[i].size >= max_size &&
291				TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && !assigned[i] &&
292				IS_PRIMARY_COMPANION(i, tensor_blocks[i])((i) < (uint32_t)((tensor_blocks[i]).companion_ref - 1)) &&
293				(!current_type || tensor_blocks[i].type == current_type))
294			{
295				ccv_nnc_tensor_opt_t a = {
296					.size = tensor_blocks[i].size,
297					.index = i,
298					.oc = adj[i].oc,
299					.type = tensor_blocks[i].type,
300				};
301				assert(a.type)((void) sizeof ((a.type) ? 1 : 0), __extension__ ({ if (a.type
) ; else __assert_fail ("a.type", "ccv_nnc_symbolic_graph_compile.c"
, 301, __extension__ __PRETTY_FUNCTION__); }));
302				current_type = a.type; // Now we now the primary type we should deal with.
303				if (tensor_blocks[i].companion_ref)
304				{
305					const int companion_ref = tensor_blocks[i].companion_ref - 1;
306					a.size = ccv_max(a.size, tensor_blocks[companion_ref].size)({ typeof (a.size) _a = (a.size); typeof (tensor_blocks[companion_ref
].size) _b = (tensor_blocks[companion_ref].size); (_a > _b
) ? _a : _b; });
307					a.oc += adj[companion_ref].oc;
308				}
309				// In case we have a tie, take them all in the array.
310				if (a.size > max_size)
311					ccv_array_clear(opt), max_size = a.size;
312				ccv_array_push(opt, &a);
313			}
314		assert(opt->rnum > 0)((void) sizeof ((opt->rnum > 0) ? 1 : 0), __extension__
 ({ if (opt->rnum > 0) ; else __assert_fail ("opt->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 314, __extension__ __PRETTY_FUNCTION__
); }));
315		// Order opt array by the oc because type and size should be equal at this point.
316		_ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
317		// Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
318		int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
319		uint64_t min_val[2] = {
320			0, 0
321		};
322		if (j > 0)
323		{
324			for (i = 0; i < opt->rnum; i++)
325			{
326				ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i)((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
 (size_t)(i)));
327				if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
328					continue;
329				// Now, determine the order between a and c. After this, we can always check whether y
330				// can hop to the earliest one and if the latest one can hop to x.
331				// The earliest one will be called p and the latest one will be called q.
332				int p = a.index;
333				int q = a.index;
334				if (tensor_blocks[a.index].companion_ref)
335				{
336					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
337					if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
338						continue;
339					const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
340					if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
341						p = companion_ref;
342					else {
343						const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
344						if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
345							q = companion_ref;
346						else { // Otherwise, b is in between p and q.
347							const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
348							const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
349							assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0)((void) sizeof ((p_hop_b.i32 && p_hop_b.i32[0] > 0
 && b_hop_q.i32 && b_hop_q.i32[0] > 0) ? 1
 : 0), __extension__ ({ if (p_hop_b.i32 && p_hop_b.i32
[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] >
 0) ; else __assert_fail ("p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 349, __extension__ __PRETTY_FUNCTION__
); }));
350						}
351					}
352				}
353				assert(tensor_blocks[q].type == tensor_blocks[p].type)((void) sizeof ((tensor_blocks[q].type == tensor_blocks[p].type
) ? 1 : 0), __extension__ ({ if (tensor_blocks[q].type == tensor_blocks
[p].type) ; else __assert_fail ("tensor_blocks[q].type == tensor_blocks[p].type"
, "ccv_nnc_symbolic_graph_compile.c", 353, __extension__ __PRETTY_FUNCTION__
); }));
354				const int type = tensor_blocks[p].type;
355				// y is always earlier than x, but this is hard to assert now.
356				// If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
357				// Thus, the hop between y and x (through a) should be smallest ones.
358				// We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
359				// out of q. For these nodes, we try to verify whether they form a connection (by checking against
360				// alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
361				int y_size = 0;
362				ccv_nnc_tensor_hop_t* const y_buf = buf;
363#define for_block(y, val) do { \
364					if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
365						y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
366							.idx = y + 1, .hop = ((int*)val)[0] \
367						}; \
368				} while(0)
369				ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
370				if (y_vector)
371					CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
372#undef for_block
373				assert(y_size <= tensor_block_size)((void) sizeof ((y_size <= tensor_block_size) ? 1 : 0), __extension__
 ({ if (y_size <= tensor_block_size) ; else __assert_fail (
"y_size <= tensor_block_size", "ccv_nnc_symbolic_graph_compile.c"
, 373, __extension__ __PRETTY_FUNCTION__); }));
374				int x_size = 0;
375				ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
376#define for_block(x, val) do { \
377					if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
378						x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
379							.idx = x + 1, .hop = ((int*)val)[0] \
380						}; \
381				} while(0)
382				ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
383				if (x_vector)
384					CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
385#undef for_block
386				assert(y_size + x_size <= tensor_block_size)((void) sizeof ((y_size + x_size <= tensor_block_size) ? 1
 : 0), __extension__ ({ if (y_size + x_size <= tensor_block_size
) ; else __assert_fail ("y_size + x_size <= tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 386, __extension__ __PRETTY_FUNCTION__
); }));
387				int x, y;
388				if (y_size > 1)
389					_ccv_nnc_sort_by_hops(y_buf, y_size, 0);
390				for (y = 0; y < y_size; y++)
391				{
392					const int hop = exec_dep_rows + y_buf[y].hop;
393					if (hop >= min_hop)
394						break;
395					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
396					if (val.u64 && val.u64[0] >= a.size)
397					{
398						min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
399							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
400						break;
401					}
402				}
403				if (x_size > 1)
404					_ccv_nnc_sort_by_hops(x_buf, x_size, 0);
405				for (x = 0; x < x_size; x++)
406				{
407					const int hop = exec_dep_rows + x_buf[x].hop;
408					if (hop >= min_hop)
409						break;
410					const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
411					if (val.u64 && val.u64[0] >= a.size)
412					{
413						min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
414							min_val[0] = val.u64[0], min_val[1] = val.u64[1];
415						break;
416					}
417				}
418				if (x_size > 0)
419				{
420					const int x_min_hop = x_buf[0].hop;
421					for (y = 0; y < y_size; y++)
422					{
423						const int y_hop_p_v = y_buf[y].hop;
424						if (y_hop_p_v + x_min_hop >= min_hop)
425							break;
426						ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
427						if (y_vector)
428						{
429							for (x = 0; x < x_size; x++)
430							{
431								const int q_hop_x_v = x_buf[x].hop;
432								const int hop = y_hop_p_v + q_hop_x_v;
433								if (hop >= min_hop)
434									break;
435								const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
436								if (val.u64 && val.u64[0] >= a.size)
437								{
438									min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
439										min_val[0] = val.u64[0], min_val[1] = val.u64[1];
440									break;
441								}
442							}
443						}
444					}
445				}
446				// If I found a place, stop, and exit.
447				if (min_y > 0 || min_x < tensor_block_size + 1)
448				{
449					min_i = i;
450					break;
451				}
452				// There is no space to insert this block, mark it as such.
453				tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
454				if (tensor_blocks[a.index].companion_ref)
455				{
456					const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
457					tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
458				}
459			}
460		}
461		// If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group),
462		// and default to the largest size available.
463		ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i))((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
 (size_t)(({ typeof (0) _a = (0); typeof (min_i) _b = (min_i)
; (_a > _b) ? _a : _b; }))));
464		if (min_i == -1)
465		{
466			allocated_size[num_assigned] = a.size;
467			++num_assigned;
468		}
469		int assign_group = num_assigned;
470		if (min_y > 0)
471		{
472			assign_group = assigned[min_y - 1];
473			// The y and x should belong to the same assigned group.
474			assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group)((void) sizeof ((min_x == tensor_block_size + 1 || assigned[min_x
 - 1] == assign_group) ? 1 : 0), __extension__ ({ if (min_x ==
 tensor_block_size + 1 || assigned[min_x - 1] == assign_group
) ; else __assert_fail ("min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group"
, "ccv_nnc_symbolic_graph_compile.c", 474, __extension__ __PRETTY_FUNCTION__
); }));
475		} else if (min_x < tensor_block_size + 1)
476			assign_group = assigned[min_x - 1];
477		// If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
478		if (min_y != 0 || min_x != tensor_block_size + 1)
479		{
480			uint64_t val[2] = {
481				min_val[0], min_val[1]
482			};
483			assert(val[0] >= a.size)((void) sizeof ((val[0] >= a.size) ? 1 : 0), __extension__
 ({ if (val[0] >= a.size) ; else __assert_fail ("val[0] >= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 483, __extension__ __PRETTY_FUNCTION__
); }));
484			val[0] -= a.size;
485			val[1] = val[1] + a.size; // Move the offset to the next one.
486			ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
487		}
488		int strings[3];
489		strings[0] = a.index + 1;
490		int string_size = 1;
491		// Assign out the designated companion if it exists.
492		if (tensor_blocks[a.index].companion_ref)
493		{
494			const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495			assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type)((void) sizeof ((tensor_blocks[a.index].type == tensor_blocks
[companion_ref].type) ? 1 : 0), __extension__ ({ if (tensor_blocks
[a.index].type == tensor_blocks[companion_ref].type) ; else __assert_fail
 ("tensor_blocks[a.index].type == tensor_blocks[companion_ref].type"
, "ccv_nnc_symbolic_graph_compile.c", 495, __extension__ __PRETTY_FUNCTION__
); }));
496			const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497			if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498			{
499				for (i = 0; i < string_size; i++)
500					strings[i + 1] = strings[i];
501				strings[0] = companion_ref + 1;
502			} else {
503				const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504				if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505					strings[string_size] = companion_ref + 1;
506				else {
507					// Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
508					assert(string_size == 2)((void) sizeof ((string_size == 2) ? 1 : 0), __extension__ ({
 if (string_size == 2) ; else __assert_fail ("string_size == 2"
, "ccv_nnc_symbolic_graph_compile.c", 508, __extension__ __PRETTY_FUNCTION__
); }));
509					strings[2] = strings[1];
510					strings[1] = companion_ref + 1;
511				}
512			}
513			++string_size;
514		}
515		// Assign out and update oc.
516		for (i = 0; i < string_size; i++)
517		{
518			const int index = strings[i] - 1;
519			// Assign out the selected one.
520			assigned[index] = assign_group;
521			// The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522			allocated_offset[index] = min_val[1];
523			if (adj[index].itf)
524				for (k = 0; k < adj[index].itf->rnum; k++)
525				{
526					const int d = *(int*)ccv_array_get(adj[index].itf, k)((void*)(((char*)((adj[index].itf)->data)) + (size_t)(adj[
index].itf)->rsize * (size_t)(k)));
527					if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])(!((tensor_blocks[d].flags & 0x3) == ALIAS) && !(
(tensor_blocks[d].flags & 0x3) == UNASSIGNED)))
528						--adj[d].oc;
529				}
530		}
531		uint64_t val[2] = {
532			a.size, min_val[1]
533		};
534		uint64_t consumed_size = 0;
535		// Go over from min_y to string_size (excluding min_x).
536		for (i = 0; i < string_size; i++)
537		{
538			const uint64_t size = tensor_blocks[strings[i] - 1].size;
539			assert(size <= a.size)((void) sizeof ((size <= a.size) ? 1 : 0), __extension__ (
{ if (size <= a.size) ; else __assert_fail ("size <= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 539, __extension__ __PRETTY_FUNCTION__
); }));
540			// Update consumed size if it is bigger than "size".
541			if (size > consumed_size)
542			{
543				val[0] = size - consumed_size;
544				ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545				consumed_size = size;
546				val[1] = min_val[1] + consumed_size;
547			}
548			// If it consumed all the flow, break out.
549			if (consumed_size == a.size)
550				break;
551		}
552		for (i = 0; i < string_size; i++)
553		{
554			const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555			uint64_t val[2] = {
556				i_size, min_val[1]
557			};
558			uint64_t consumed_size = 0;
559			for (k = i + 1; k < string_size; k++)
560			{
561				const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size)({ typeof (i_size) _a = (i_size); typeof (tensor_blocks[strings
[k] - 1].size) _b = (tensor_blocks[strings[k] - 1].size); (_a
 < _b) ? _a : _b; });
562				// Update consumed size if it is bigger than "size".
563				if (size > consumed_size)
564				{
565					val[0] = size - consumed_size;
566					ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567					consumed_size = size;
568					val[1] = min_val[1] + consumed_size;
569				}
570				// If it consumed all the flow, break out.
571				if (consumed_size == i_size)
572					break;
573			}
574			val[0] = i_size - consumed_size;
575			// Still have residual, flow it to min_x.
576			if (val[0] > 0)
577				ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578		}
579		if (min_i == -1)
580		{
581			// If we decide to insert a new edge, simply mark anyone who does not interfere with it to redo.
582			const int p = strings[0] - 1;
583			const int q = strings[string_size - 1] - 1;
584			const int type = tensor_blocks[p].type;
585#define for_block(y, val) do { \
586				if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
587				{ \
588					tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
589					if (tensor_blocks[y].companion_ref) \
590					{ \
591						const int companion_ref = tensor_blocks[y].companion_ref - 1; \
592						tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
593					} \
594				} \
595			} while(0)
596			ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
597			if (y_vector)
598				CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
 for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
599#undef for_block
600#define for_block(x, val) do { \
601				if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
602				{ \
603					tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
604					if (tensor_blocks[x].companion_ref) \
605					{ \
606						const int companion_ref = tensor_blocks[x].companion_ref - 1; \
607						tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
608					} \
609				} \
610			} while(0)
611			ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
612			if (x_vector)
613				CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
 & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
 * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
 for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
614#undef for_block
615		}
616		j += string_size;
617	}
618	ccfreefree(tensor_block_cannot_insert);
619	ccfreefree(buf);
620	ccv_array_free(opt);
621	ccv_matrix_free(tensor_df);
622	ccv_matrix_free(tensor_dt);
623#define for_block(y, x, val) do { \
624		if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
625		{ \
626			if (!alloc_dep[x - 1]) \
627				alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
628			ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
629		} \
630	} while (0)
631	CCV_SPARSE_FOREACH(alloc, for_block)do { if ((alloc)->major & CCV_SPARSE_COL_MAJOR) { switch
 ((((alloc)->type) & 0xFF000)) { case CCV_32S: { do { uint32_t
 _i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__
((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF
); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0
; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t* const _idx_
 = (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const
 _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 ||
 !_v_->size) continue; for (_j_ = 0; _j_ < _v_->size
; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.i32 +
 (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
 (alloc)->size; __attribute__((unused)) const size_t _c_ =
 (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.u8 + (0))); } } } } while (0); } } } else
 { switch ((((alloc)->type) & 0xFF000)) { case CCV_32S
: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->
size; __attribute__((unused)) const size_t _c_ = (((alloc)->
type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
 = (alloc)->size; __attribute__((unused)) const size_t _c_
 = (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
 (alloc)->size; __attribute__((unused)) const size_t _c_ =
 (((alloc)->type) & 0xFFF); if ((alloc)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
 1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
 & 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
 + _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
 + _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
 0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } }
 } while (0);
632#undef for_block
633	ccv_matrix_free(alloc);
634	for (i = 0; i < tensor_block_size; i++)
635		if (adj[i].itf)
636			ccv_array_free(adj[i].itf);
637	ccfreefree(adj);
638	ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
639	alloc_prep->alloc_dep = alloc_dep;
640	alloc_prep->vt_block_size = tensor_block_size;
641	alloc_prep->buffer_size = num_assigned;
642	alloc_prep->block_size = available_tensor_size;
643	alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
644	alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
645	alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
646	memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
647	for (i = 0; i < num_assigned; i++)
648		alloc_prep->buffers[i].size = allocated_size[i];
649	if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
650	{
651		size_t total_size = 0;
652		for (i = 0; i < num_assigned; i++)
653			total_size += allocated_size[i];
654		PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf
("Total buffer size of %zu to be allocated\n", total_size); fflush
(stdout); } } while (0);
655	}
656	ccfreefree(allocated_size);
657	j = 0;
658	// Assigning out the tensors (in case of sharing tensors / in-place ops).
659	for (i = 0; i < tensor_block_size; i++)
660		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
661		{
662			alloc_prep->blocks[j].block_ref = i;
663			if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
664			{
665				alloc_prep->vt_blocks[i] = j;
666				// Also, set its allocations.
667				assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 667, __extension__ __PRETTY_FUNCTION__
); }));
668				const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
669				alloc_prep->blocks[j].offset = allocated_offset[i];
670				if (!alloc_prep->buffers[buffer_ref].type)
671					alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
672				alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
673				alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
674				assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
 alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
 ({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 674, __extension__ __PRETTY_FUNCTION__
); }));
675			} else {
676				alloc_prep->vt_blocks[i] = -1;
677				alloc_prep->blocks[j].buffer_ref = -1;
678				alloc_prep->blocks[j].offset = 0;
679			}
680			++j;
681		} else
682			alloc_prep->vt_blocks[i] = -1;
683	ccfreefree(allocated_offset);
684	ccfreefree(assigned);
685	return alloc_prep;
686}
687 
// Release an alloc_prep structure together with the arrays it owns.
// Every non-NULL alloc_dep[i] (i < vt_block_size) and every non-NULL buffers[i].dup_p_refs
// (i < buffer_size) is freed with ccv_array_free, then the alloc_dep table, then alloc_prep itself.
// blocks / buffers / vt_blocks are not freed separately: the code that builds alloc_prep points them
// into the same allocation as alloc_prep (starting at alloc_prep + 1).
688static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
689{
690	int i;
691	for (i = 0; i < alloc_prep->vt_block_size; i++)
692		if (alloc_prep->alloc_dep[i])
693			ccv_array_free(alloc_prep->alloc_dep[i]);
694	for (i = 0; i < alloc_prep->buffer_size; i++)
695		if (alloc_prep->buffers[i].dup_p_refs)
696			ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
697	ccfreefree(alloc_prep->alloc_dep);
698	ccfreefree(alloc_prep);
699}
700 
701// Simple allocator from ccv_array_t.
// Reserves room for `size` bytes at the end of tensor_metadata and returns an encoded position for it.
// The array grows by (size + 15) / 16 elements, i.e. size rounded up to 16-byte units
// (NOTE(review): this presumes the array's element size is 16 bytes - confirm where tensor_metadata is created).
// The return value is (index << 1) + 1, so it is always odd; _ccv_nnc_tensor_metadata_get undoes the shift.
702static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
703{
704	int pos = tensor_metadata->rnum;
705	int rsize = (size + 15) / 16;
706	ccv_array_resize(tensor_metadata, pos + rsize);
707	return (pos << 1) + 1;
708}
709 
// Translate an encoded position (as returned by _ccv_nnc_tensor_metadata_pos_new) into a pointer
// to the corresponding element of tensor_metadata. The element index is pos >> 1 and is asserted to be
// below the array's current rnum. The pointer refers to the array's storage at the time of the call.
710static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
711{
712	assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 712, __extension__ __PRETTY_FUNCTION__
); }));
713	return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)));
714}
715 
// Non-zero when the low bit of ptr (viewed as uintptr_t) is set. Encoded metadata positions are
// always odd ((index << 1) + 1), which is how they are told apart from ordinary tensor pointers.
716#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
717 
// Replace encoded metadata positions stored inside a tensor with pointers into tensor_metadata.
// vt_tensor is either an ordinary tensor pointer (returned unchanged) or an encoded position.
// For an encoded position the tensor is looked up and then:
//   - alias_ref, if it holds an encoded position, is replaced by the resolved pointer and the alias target is rewired recursively;
//   - for a multiview, each of the (kind + repeat) data entries holding an encoded position is replaced and rewired recursively,
//     the parent pointer p is resolved (without recursion), and every entry of sp holding an encoded position is replaced and rewired.
// Returns the resolved tensor pointer.
718static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
719{
720	// If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
721	if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
722		return vt_tensor;
723	ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
724	if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
725	{
	// Keep the encoded position: alias_ref is overwritten with a pointer before the recursive call.
726		const int alias_ref = tensor->alias_ref;
727		tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
728		_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
729	}
730	if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
731	{
732		ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
733		int i;
734		const int count = mv->kind + mv->repeat;
735		for (i = 0; i < count; i++)
736		{
737			if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1))
738			{
			// Same pattern as above: remember the position, store the pointer, then rewire the child.
739				const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i];
740				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
741				_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
742			}
743		}
744		// No need to recursively do parent pointer, otherwise we are in deep rewire.
745		if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
746			mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
747		if (mv->sp)
748			for (i = 0; i < mv->sp->rnum; i++)
749			{
750				ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)));
751				if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
752				{
753					const int pos = (int)(intptr_t)*tensor;
754					*tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
755					assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
 "ccv_nnc_symbolic_graph_compile.c", 755, __extension__ __PRETTY_FUNCTION__
); }));
756					_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
757				}
758			}
759	}
760	return tensor;
761}
762 
// Pairs a byte pointer with an integer position.
// NOTE(review): presumably associates a buffer address with the metadata position of the tensor
// created for it; no use of this type is visible in this part of the file - confirm against its users.
763typedef struct {
764	const uint8_t* ptr;
765	int pos;
766} ccv_nnc_tensor_block_pos_t;
767 
// Locate the memory that backs block_ref of prep by walking up the ancestor preps, and create a tensor for it.
// block_ref is first resolved through its .ref chain, giving a buffer, an offset and the buffer's parent block (p_refs[0] - 1).
// Then for i = idx - 1 down to 0: the parent block is mapped through preps[i]->dup_tensor_block_ref when ch[i] is non-zero,
// resolved through its own .ref chain, and its offset is added to the running offset. The first level whose tensor_arena
// buffer already has a pointer supplies the memory: a new tensor is allocated in tensor_metadata, initialised with that
// pointer and params, and advanced by the accumulated offset.
// Returns the encoded metadata position of the new tensor, or 0 when no level has a buffer pointer or when any
// remaining ch[] entry below the matching level index is non-zero.
768static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
769{
770	int i;
771	int unref_block_ref = block_ref;
772	while (prep->tensor_blocks[unref_block_ref].ref)
773		unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
774	int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
775	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 775, __extension__ __PRETTY_FUNCTION__); }));
776	assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
 == prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
 ("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 776, __extension__ __PRETTY_FUNCTION__
); }));
777	const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
778	uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
779	int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
780	for (i = idx - 1; i >= 0; i--)
781	{
782		assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
 (p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 782, __extension__ __PRETTY_FUNCTION__); }));
783		const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
784		const int unroll_count = graph_prep->unroll_count;
785		if (ch[i]) // Prefer the dup side of things.
786			p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
787		int unref_p_ref = p_ref;
788		while (graph_prep->tensor_blocks[unref_p_ref].ref)
789			unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
790		vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
		// This buffer_ref shadows the outer one: it indexes this ancestor's buffers.
791		const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
792		offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
793		// If the buffer already exists, prefer that.
794		const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
795		if (ptr)
796		{
797			// If I have any remaining path that is not covered from 0, I cannot possibly
798			// have any pointer from buffer (that can only happen if it is not dup).
			// The outer loop index is reused here on purpose: every path below returns, so the outer loop never resumes.
799			for (--i; i >= 0; i--)
800				if (ch[i] != 0)
801					return 0;
802			// Try to find the created tensor block pos in the array, just linear scan.
803			const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
804			ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
805			*tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
806			ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
807			return tv_pos;
808		}
809		p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
810	}
811	return 0;
812}
813 
814// Descent from root to the prep level, and compose multiview from there.
// Recursive over idx: preps[idx] is examined in this call, and the recursion stops when preps[idx] == graph_prep.
// Parameters:
//   tensor_metadata - array from which tensor / multiview metadata is allocated.
//   params          - tensor parameters forwarded to _ccv_nnc_tensor_multiview_find_pos.
//   preserve        - when non-zero, the result at graph_prep is wrapped in an extra K1N multiview whose first entry is a placeholder.
//   assign_update   - only forwarded to the recursive calls (its one use in a condition is commented out).
//   preps           - preps indexed by depth.
//   graph_prep      - the prep at which the descent ends.
//   block_ref       - tensor block index, interpreted at graph_prep.
//   ch              - per-depth selection; ch[idx] is written here (0 for the original, i + 1 for the i-th unrolled copy) before recursing.
//   idx             - current depth.
//   pos_ref         - output, must be non-NULL; receives the encoded metadata position of the composed tensor / multiview.
// Returns 0 on success, or -1 when no data position could be found at graph_prep (in which case *pos_ref is not written).
815static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
816{
817	assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 817, __extension__ __PRETTY_FUNCTION__); }));
818	int i;
819	const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
820	const int unroll_count = prep->unroll_count;
821	if (prep == graph_prep)
822	{
823		const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
824		if (!data_pos)
825			return -1;
826		// Based on ch, go all the way back to find the exact pointer to compose.
827		if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
828			prep->dup_tensor_block_ref &&
829			prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
830			prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
831		{
			// The block has distinct unrolled copies: gather the original plus each copy into a K0N multiview.
832			int pos[unroll_count + 1];
833			pos[0] = data_pos;
834			for (i = 0; i < unroll_count; i++)
835				pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
836			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
837			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
838			ccv_nnc_tensor_t* data[unroll_count + 1];
839			for (i = 0; i < unroll_count + 1; i++)
840				data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
841			ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
			// Store encoded positions rather than pointers; _ccv_nnc_tensor_metadata_rewire turns them back into pointers.
842			for (i = 0; i < unroll_count + 1; i++)
843				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
844			*pos_ref = mv_pos;
845		} else {
846			*pos_ref = data_pos;
847		}
848		if (preserve)
849		{
850			// If need to preserve, this need to be more complicated. At loop 0, I need to access the new assigned tv.
851			// at any other loops, it should be the same. Thus, for this case, I will create a mv tensor as following:
852			// mv of K11, thus, when loop is 0, it unwrap to mv->data[0], otherwise, unwrap to mv->data[1].
853			// mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
854			// arena allocated).
855			// mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed raw pointer directly or
856			// a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
857			// it to a K01 structure.
858			// Why we didn't wrap it directly as mv->data[0] pointing to a assigned tv pointer and the mv->data[1] pointing
859			// to the raw pointer (as ptr_ref) with K11? The reason is we don't know the assigned tv is pointing to one
860			// memory region, or is a managed by multi-view tensor, which could pointing to different memory regions.
861			int prev_mv_pos = *pos_ref;
862			if (prev_mv_pos == -1)
863			{
864				prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865				ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
866				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
867				ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
868					tv,
869				}, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
870				CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
871			}
872			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
873			ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
874			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
875			ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
876				CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
877				(ccv_nnc_tensor_t*)prev_mv,
878			}, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
879			prev_mv->p = (void*)(intptr_t)mv_pos;
880			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
881			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
882			*pos_ref = mv_pos;
883		}
884		return 0;
885	}
	// Not at graph_prep yet: resolve the original (ch[idx] == 0) path one level down first; it is asserted to succeed.
886	ch[idx] = 0;
887	int pos[unroll_count + 1];
888	pos[0] = 0;
889	const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
890	assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 890, __extension__ __PRETTY_FUNCTION__); }));
	// Then try each unrolled copy. A failure is only expected on the first copy, which leaves i == 0 after the break.
891	for (i = 0; i < unroll_count; i++)
892	{
893		ch[idx] = i + 1;
894		pos[i + 1] = 0;
895		const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
896		if (dup_retval < 0)
897		{
898			assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 898, __extension__ __PRETTY_FUNCTION__); }));
899			break;
900		}
901	}
902	// If current prep has no dup.
903	if (i == 0)
904	{
905		*pos_ref = pos[0];
906		return 0;
907	}
908	ccv_nnc_tensor_t* data[unroll_count + 1];
909	// Compose to a new multiview.
910	for (i = 0; i < unroll_count + 1; i++)
911		{ assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
 (pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 911, __extension__ __PRETTY_FUNCTION__); })); }
912	const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
913	for (i = 0; i < unroll_count + 1; i++)
914		data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
915	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
916	ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
	// Child multiviews get the new multiview recorded as their parent, stored as an encoded position.
917	for (i = 0; i < unroll_count + 1; i++)
918		if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
919			((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
920	for (i = 0; i < unroll_count + 1; i++)
921		CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
922	*pos_ref = mv_pos;
923	return 0;
924}
925 
// Classify p_ref against the inputs and outputs of an exec node.
// node must be non-NULL (asserted).
// Returns 1 if p_ref appears in node->outputs (even when it is also an input),
// -1 if it appears only in node->inputs, and 0 if it appears in neither.
926static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
927{
928	int i;
929	int is_input = 0;
930	assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
 else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 930, __extension__ __PRETTY_FUNCTION__); }));
	// Both scans stop at the first match because the flag is part of the loop condition.
931	for (i = 0; i < node->input_size && !is_input; i++)
932		if (p_ref == node->inputs[i])
933			is_input = 1;
934	int is_output = 0;
935	for (i = 0; i < node->output_size && !is_output; i++)
936		if (p_ref == node->outputs[i])
937			is_output = 1;
938	// Prefer it is an output if it is both the input and the output.
939	if (is_output)
940		return 1;
941	if (is_input)
942		return -1;
943	return 0;
944}
945 
// Decide whether tensor block block_ref of graph_prep needs its value preserved (see the long comment in the body).
// Returns 0 when any of the following holds, checked in this order:
//   - graph_prep is not flagged CCV_NNC_GRAPH_EXEC_P_WHILE;
//   - the block is unassigned;
//   - the block's first parent ref (p_refs[0] - 1) is not an input-only tensor of the parent exec node at exec_idx - 1;
//   - the buffer the block is assigned to is READ_ONLY;
//   - the buffer's first parent ref equals the block's first parent ref.
// Returns 1 otherwise.
946static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
947{
948	// No need to check whether to preserve if this is not a while loop.
949	if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
950		return 0;
951	assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 951, __extension__ __PRETTY_FUNCTION__
); }));
952	// If it is unassigned, no need to preserve.
953	if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
 UNASSIGNED))
954		return 0;
955	const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
956	// If p is not input, no need to preserve at all.
957	if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
958		return 0;
959	const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
960	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 960, __extension__ __PRETTY_FUNCTION__); }));
961	assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
 graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
 __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 961, __extension__ __PRETTY_FUNCTION__
); }));
962	const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
963	// If the buffer is a truly read-only one, no need to preserve.
964	if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
 0xc) == READ_ONLY)
965		return 0;
966	/* This needs detailed explanation, what does preserve mean?
967	 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
968	 * also used outside of the while loop, we cannot reuse the memory region of x for
969	 * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
970	 * y uses the same memory region as x). The way to workaround this is by using a different
971	 * memory region for y = x + 1, but for the first iteration, having x pointing to the
972	 * original. During the allocation process, the way to identify whether x should preserve
973	 * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
974	 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
975	 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
976	 * it is the input tensor whenever that is possible. A tensor block can point to two parent
977	 * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
978	 * tensor whenever that is possible. */
979	if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
980		return 0;
981	// Otherwise, return 1 because we now need to preserve.
982	return 1;
983}
984 
985static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
986{
987	assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 987, __extension__ __PRETTY_FUNCTION__
); }));
988	// If it is unassigned, no need to preserve.
989	if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
 UNASSIGNED))
990		return 0;
991	// Only tape var need to force broadcast, otherwise we already share the same memory region.
992	if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
993		return 0;
994	const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
995	// If p is not output, no need to broadcast at all.
996	if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
997		return 0;
998	const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
999	assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 999, __extension__ __PRETTY_FUNCTION__); }));
1000	assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
 graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
 __assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1000, __extension__ __PRETTY_FUNCTION__
); }));
1001	const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
1002	// If the buffer is a truly read-only one, no need to broadcast.
1003	if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
 0xc) == READ_ONLY)
1004		return 0;
1005	// Otherwise, return 1 because we now need to force broadcast for this tape var.
1006	return 1;
1007}
1008 
1009static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
1010{
1011	assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1011, __extension__ __PRETTY_FUNCTION__); }));
1012	int i;
1013	for (i = 0; i < mv->kind + mv->repeat; i++)
1014		if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
1015			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = tensor;
1016		else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW))
1017			_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i], tensor);
1018}
1019 
1020static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1021{
1022	assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1022, __extension__ __PRETTY_FUNCTION__); }));
1023	int i;
1024	if (mv->sp)
1025		for (i = 0; i < mv->sp->rnum; i++)
1026		{
1027			ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)));
1028			if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1029			{
1030				const int pos = (int)(intptr_t)*tensor;
1031				*tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1032				assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
 "ccv_nnc_symbolic_graph_compile.c", 1032, __extension__ __PRETTY_FUNCTION__
); }));
1033				_ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1034			}
1035		}
1036	for (i = 0; i < mv->kind + mv->repeat; i++)
1037	{
1038		if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
 : (mv)->_inline_data)[i]) & 1))
1039			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1040		if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
 : (mv)->_inline_data)[i]->alias_ref) & 1))
1041			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]->alias_ref);
1042		if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW))
1043			_ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1044	}
1045}
1046 
1047static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1048{
1049	// Go to the root of the graph.
1050	const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1051	int i;
1052	for (i = 1; prep->p; i++)
1053		prep = prep->p;
1054	// Root graph should have no dup tensor blocks.
1055	assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
 ({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1055, __extension__ __PRETTY_FUNCTION__); }));
1056	const int c = i;
1057	const ccv_nnc_symbolic_graph_prep_t* preps[c];
1058	prep = graph_prep;
1059	preps[c - 1] = prep;
1060	for (i = 0; prep->p; i++)
1061		preps[c - 2 - i] = prep = prep->p;
1062	int ch[c]; // Use dynamic allocation for array. This is an array to record our selections when recursive from top to bottom.
1063	memset(ch, 0, sizeof(int) * c);
1064	int pos = 0;
1065	_ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1066	assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
 (ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1066, __extension__ __PRETTY_FUNCTION__); })); // This shouldn't never be modified.
1067	assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
 > 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1067, __extension__ __PRETTY_FUNCTION__); }));
1068	return pos;
1069}
1070 
1071static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1072{
1073	const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1074	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1075	ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1076	ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1077		CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1078		tv,
1079	}, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1080	CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1081	CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = tensor;
1082	return mv_pos;
1083}
1084 
1085static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1086{
1087	ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1088	const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1089	if (!is_multiview)
1090		return pos;
1091	while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1092	{
1093		const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1094		tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1095	}
1096	const ccv_nnc_tensor_t tensor = *tensor_ptr;
1097	const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1098	ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1099	*new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1100	new_tensor->dataof = tensor.dataof;
1101	ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1102	new_tensor->alias_ref = (uintptr_t)pos;
1103	ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1104	return new_pos;
1105}
1106 
1107static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1108{
1109	const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1110	// It referenced to is not an alias.
1111	assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__
 ({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1111, __extension__ __PRETTY_FUNCTION__
); }));
1112	const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1113	const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1114	assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1114, __extension__ __PRETTY_FUNCTION__
); }));
1115	// Will use that to determine whether insert reference or not.
1116	const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1117	while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1118	{
1119		const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1120		alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1121	}
1122	const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1123	// If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1124	int pos;
1125	if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1126		ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1127	{
1128		pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1129		ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1130		*tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1131		tensor->dataof = alias_tensor.dataof;
1132	} else {
1133		pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1134		ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1135		// Otherwise initialize a tensor view
1136		*tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1137		tensor_view->alias_ref = (uintptr_t)alias_pos;
1138	}
1139	vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1140	if (is_multiview)
1141	{
1142		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1143		ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1144	}
1145}
1146 
1147static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1148{
1149	// If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1150	if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1151	{
1152		const int ref = tensor_blocks[block_ref].alias_ref - 1;
1153		if (!vt_tensors[ref])
1154			_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1155		vt_tensors[block_ref] = vt_tensors[ref];
1156		return;
1157	}
1158	assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1158, __extension__ __PRETTY_FUNCTION__
); }));
1159	const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1160	// If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1161	if (!vt_tensors[alias_ref])
1162		_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1163	_ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1164}
1165 
// Helpers for turning a linear pointer into an object storage (such as MTLBuffer).
#ifdef HAVE_MPS
// Arena disposer callback: releases one object through mpobjfree.
// userdata is part of the callback signature and is not used.
static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
{
	(void)userdata;
	mpobjfree(0, ptr);
}
#endif
1173 
// Pairs an object handle with a byte size.
typedef struct {
	size_t size; // Size associated with obj.
	void* obj; // Opaque object handle.
} tensor_arena_obj_track_t;
1178 
// Key of the obj_ptr hash map: identifies a region as (base pointer, offset, size).
// Two keys are the same only when all three fields match.
typedef struct {
	void* ptr; // Base pointer of the underlying allocation.
	off_t offset; // Offset of the region from ptr.
	size_t size; // Size of the region.
} obj_ptr_key_t;
1184 
1185static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1186{
1187	return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1188}
1189 
1190static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1191{
1192	return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1193}
1194 
1195KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)typedef struct kh_obj_ptr_s { khint_t n_buckets, size, n_occupied
, upper_bound; khint32_t *flags; obj_ptr_key_t *keys; void* *
vals; } kh_obj_ptr_t; static inline __attribute__ ((__unused__
)) kh_obj_ptr_t *kh_init_obj_ptr(void) { return (kh_obj_ptr_t
*)calloc(1,sizeof(kh_obj_ptr_t)); } static inline __attribute__
 ((__unused__)) void kh_destroy_obj_ptr(kh_obj_ptr_t *h) { if
 (h) { free((void *)h->keys); free(h->flags); free((void
 *)h->vals); free(h); } } static inline __attribute__ ((__unused__
)) void kh_clear_obj_ptr(kh_obj_ptr_t *h) { if (h && h
->flags) { memset(h->flags, 0xaa, ((h->n_buckets) <
 16? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h->
size = h->n_occupied = 0; } } static inline __attribute__ (
(__unused__)) khint_t kh_get_obj_ptr(const kh_obj_ptr_t *h, obj_ptr_key_t
 key) { if (h->n_buckets) { khint_t k, i, last, mask, step
 = 0; mask = h->n_buckets - 1; k = _kh_obj_ptr_hash_func(key
); i = k & mask; last = i; while (!((h->flags[i>>
4]>>((i&0xfU)<<1))&2) && (((h->
flags[i>>4]>>((i&0xfU)<<1))&1) || !
_kh_obj_ptr_hash_equal(h->keys[i], key))) { i = (i + (++step
)) & mask; if (i == last) return h->n_buckets; } return
 ((h->flags[i>>4]>>((i&0xfU)<<1))&
3)? h->n_buckets : i; } else return 0; } static inline __attribute__
 ((__unused__)) int kh_resize_obj_ptr(kh_obj_ptr_t *h, khint_t
 new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (
--(new_n_buckets), (new_n_buckets)|=(new_n_buckets)>>1,
 (new_n_buckets)|=(new_n_buckets)>>2, (new_n_buckets)|=
(new_n_buckets)>>4, (new_n_buckets)|=(new_n_buckets)>>
8, (new_n_buckets)|=(new_n_buckets)>>16, ++(new_n_buckets
)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size
 >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0
; else { new_flags = (khint32_t*)malloc(((new_n_buckets) <
 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t)); if (
!new_flags) return -1; memset(new_flags, 0xaa, ((new_n_buckets
) < 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t))
; if (h->n_buckets < new_n_buckets) { obj_ptr_key_t *new_keys
 = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets *
 sizeof(obj_ptr_key_t)); if (!new_keys) { free(new_flags); return
 -1; } h->keys = new_keys; if (1) { void* *new_vals = (void
**)realloc((void *)h->vals,new_n_buckets * sizeof(void*));
 if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals
; } } } } if (j) { for (j = 0; j != h->n_buckets; ++j) { if
 (((h->flags[j>>4]>>((j&0xfU)<<1))&
3) == 0) { obj_ptr_key_t key = h->keys[j]; void* val; khint_t
 new_mask; new_mask = new_n_buckets - 1; if (1) val = h->vals
[j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<<
1)); while (1) { khint_t k, i, step = 0; k = _kh_obj_ptr_hash_func
(key); i = k & new_mask; while (!((new_flags[i>>4]>>
((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask
; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<<
1))); if (i < h->n_buckets && ((h->flags[i>>
4]>>((i&0xfU)<<1))&3) == 0) { { obj_ptr_key_t
 tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (1
) { void* tmp = h->vals[i]; h->vals[i] = val; val = tmp
; } (h->flags[i>>4]|=1ul<<((i&0xfU)<<
1)); } else { h->keys[i] = key; if (1) h->vals[i] = val
; break; } } } } if (h->n_buckets > new_n_buckets) { h->
keys = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets
 * sizeof(obj_ptr_key_t)); if (1) h->vals = (void**)realloc
((void *)h->vals,new_n_buckets * sizeof(void*)); } free(h->
flags); h->flags = new_flags; h->n_buckets = new_n_buckets
; h->n_occupied = h->size; h->upper_bound = (khint_t
)(h->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static
 inline __attribute__ ((__unused__)) khint_t kh_put_obj_ptr(kh_obj_ptr_t
 *h, obj_ptr_key_t key, int *ret) { khint_t x; if (h->n_occupied
 >= h->upper_bound) { if (h->n_buckets > (h->size
<<1)) { if (kh_resize_obj_ptr(h, h->n_buckets - 1) <
 0) { *ret = -1; return h->n_buckets; } } else if (kh_resize_obj_ptr
(h, h->n_buckets + 1) < 0) { *ret = -1; return h->n_buckets
; } } { khint_t k, i, site, last, mask = h->n_buckets - 1,
 step = 0; x = site = h->n_buckets; k = _kh_obj_ptr_hash_func
(key); i = k & mask; if (((h->flags[i>>4]>>
((i&0xfU)<<1))&2)) x = i; else { last = i; while
 (!((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && (((h->flags[i>>4]>>((i&0xfU)
<<1))&1) || !_kh_obj_ptr_hash_equal(h->keys[i], key
))) { if (((h->flags[i>>4]>>((i&0xfU)<<
1))&1)) site = i; i = (i + (++step)) & mask; if (i ==
 last) { x = site; break; } } if (x == h->n_buckets) { if (
((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && site != h->n_buckets) x = site; else x = i; }
 } } if (((h->flags[x>>4]>>((x&0xfU)<<
1))&2)) { h->keys[x] = key; (h->flags[x>>4]&=
~(3ul<<((x&0xfU)<<1))); ++h->size; ++h->
n_occupied; *ret = 1; } else if (((h->flags[x>>4]>>
((x&0xfU)<<1))&1)) { h->keys[x] = key; (h->
flags[x>>4]&=~(3ul<<((x&0xfU)<<1)))
; ++h->size; *ret = 2; } else *ret = 0; return x; } static
 inline __attribute__ ((__unused__)) void kh_del_obj_ptr(kh_obj_ptr_t
 *h, khint_t x) { if (x != h->n_buckets && !((h->
flags[x>>4]>>((x&0xfU)<<1))&3)) { (
h->flags[x>>4]|=1ul<<((x&0xfU)<<1));
 --h->size; } }
1196 
1197static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1198{
1199	if (params.dim[0] == 0)
1200		return 0;
1201#ifdef HAVE_MPS
1202	if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1203	{
1204		int ret;
1205		const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
 12] * ccv_nnc_tensor_count(params);
1206		const obj_ptr_key_t key = {
1207			.ptr = ptr,
1208			.offset = offset,
1209			.size = size,
1210		};
1211		khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
1212		if (ret != 0)
1213		{
1214			void* obj = mpobjcreate(ptr, offset, size);
1215			if (!tensor_arena->disposers)
1216				tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1217			ccv_nnc_arena_disposer_t disposer = {
1218				.ptr = obj,
1219				.userdata = 0,
1220				.dispose = _ccv_nnc_tensor_arena_obj_dispose
1221			};
1222			ccv_array_push(tensor_arena->disposers, &disposer);
1223			kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1224			return obj;
1225		} else
1226			return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1227	}
1228#endif
1229	return ptr + offset;
1230}
1231 
1232static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1233{
1234	// All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1235	// Each tensor have the designation in assigned array, and offset in allocated_offset.
1236	const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1237	ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1238	const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1239	const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1240	const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1241	const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1242	const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1243	const int unroll_count = graph_prep->unroll_count;
1244	int i, j;
1245	for (i = 0; i < tensor_symbol_info_size; i++)
1246		for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1247		{
1248			const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1249			if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1250				TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
);
1251		}
1252	ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1253	graph_prep->tensor_arena = tensor_arena;
1254	tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1255	tensor_arena->buffers = (void*)(tensor_arena + 1);
1256	tensor_arena->buffer_size = alloc_prep->buffer_size;
1257	tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1258	tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1259	tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1260	tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1261	tensor_arena->pb_vt_tensors = 0;
1262	tensor_arena->vt_alias_r_refs_p = 0;
1263	tensor_arena->vt_alias_r_refs = 0;
1264	tensor_arena->vt_sizes = 0;
1265	tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1266	tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, (sizeof(ccv_nnc_tensor_t) * tensor_symbol_info_size + 15) / 16, 0);
1267	tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1268	tensor_arena->allocator.context.free = allocator.context.free;
1269	tensor_arena->allocator.isa = allocator.isa;
1270	tensor_arena->disposers = 0;
1271	// Copy alias_ref info back to the tensor arena.
1272	for (i = 0; i < tensor_symbol_info_size; i++)
1273		tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1274	// Do the buffer copies.
1275	for (i = 0; i < alloc_prep->buffer_size; i++)
1276		tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1277			tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1278			tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1279	if (graph_prep->while_count_tensor)
1280	{
1281		// If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1282		int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1283		assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
 ({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1283, __extension__ __PRETTY_FUNCTION__
); })); // pos must be 0 position.
1284		ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1285		*tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1286	}
1287	assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
 && !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
 && p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1287, __extension__ __PRETTY_FUNCTION__
); }));
1288	if (p_arena && p_graph_prep)
1289	{
1290		// Don't need to allocate the actual buffer, just use the pointer from the above.
1291		PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("Buffer assignment for sub arena %p (parent %p)\n",
 tensor_arena, p_arena); fflush(stdout); } } while (0);
1292		for (i = 0; i < tensor_arena->buffer_size; i++)
1293		{
1294			const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1295			int unref_p_ref = p_ref;
1296			while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1297				unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1298			assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
 ({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1298, __extension__ __PRETTY_FUNCTION__
); }));
1299			const int p_unroll_count = p_graph_prep->unroll_count;
1300			if (p_graph_prep->dup_tensor_block_ref &&
1301				p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1302				p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1303			{
1304				// This condition means in the parent graph, we point to multiple tensor blocks for the same
1305				// buffer, therefore, we cannot have one single pointer assigned in this case.
1306				// Later we will handle this by generate ccv_tensor_multiview_t structure.
1307				tensor_arena->buffers[i].ptr = 0;
1308				PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0);
1309				continue;
1310			}
1311			// Otherwise, find the actual buffer pointer.
1312			const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1313			assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1313, __extension__ __PRETTY_FUNCTION__); }));
1314			const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1315			if (!p_arena->buffers[buffer_ref].ptr)
1316			{
1317				// Pass it down as 0 ptr.
1318				tensor_arena->buffers[i].ptr = 0;
1319				PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0);
1320				continue;
1321			}
1322			const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1323			tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1324			PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
 (0);
1325		}
1326	} else {
1327		// Now, allocate actual buffers.
1328		PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0);
1329		for (i = 0; i < tensor_arena->buffer_size; i++)
1330		{
1331			const int buffer_type = tensor_arena->buffers[i].type;
1332			const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1333			if (tensor_arena->buffers[i].size == 0)
1334			{
1335				tensor_arena->buffers[i].ptr = 0;
1336				PRINT(CCV_CLI_VERBOSE, "|-Skip buffer %d with size 0\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Skip buffer %d with size 0\n", i); fflush(stdout
); } } while (0);
1337				continue;
1338			}
1339#ifdef HAVE_CUDA1
1340			if (memory_type == CCV_TENSOR_GPU_MEMORY)
1341			{
1342				const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1343				if (allocator.isa && allocator.isa->alloc)
1344					tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1345				else
1346					tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1347				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1348			} else {
1349				assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1349, __extension__ __PRETTY_FUNCTION__
); }));
1350				if (tensor_arena->buffers[i].pin_mem)
1351					tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1352				else
1353					ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1354				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1355			}
1356#elif defined(HAVE_MPS)
1357			if (memory_type == CCV_TENSOR_GPU_MEMORY)
1358			{
1359				const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1360				// if (allocator.isa && allocator.isa->alloc)
1361				// 	tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1362				// else
1363				tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1364				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1365			} else {
1366				assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1366, __extension__ __PRETTY_FUNCTION__
); }));
1367				ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1368				PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1369			}
1370#else
1371			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1371, __extension__ __PRETTY_FUNCTION__
); }));
1372			ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1373			PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
 { printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0);
1374#endif
1375			assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
 ({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
 ("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1375, __extension__ __PRETTY_FUNCTION__); }));
1376		}
1377	}
1378	// Go over sub_preps and allocate arenas for them. Do it this early because
1379	// we may reference tensors from sub arenas, the reason why we need to reference
1380	// tensors from sub arenas is because for output tensors, sub arena's tensor
1381	// will have automatic reference updates.
1382	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1383		if (graph_prep->sub_preps[i])
1384			tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1385		else
1386			tensor_arena->sub_arenas[i] = 0;
1387	memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1388	// Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1389	ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1390#ifdef HAVE_MPS
1391	khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1392#else
1393	khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1394#endif
1395	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1396		if (tensor_arena->sub_arenas[i])
1397		{
1398			assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
 ({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1398, __extension__ __PRETTY_FUNCTION__
); }));
1399			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1400			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1401			if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1402				for (j = 0; j < node->output_size; j++)
1403				{
1404					const int idx = node->outputs[j];
1405					const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1;
1406					assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
 (s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1406, __extension__ __PRETTY_FUNCTION__); }));
1407					ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1408					assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
 ({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
 ("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1408, __extension__ __PRETTY_FUNCTION__); }));
1409					ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1410					// Only assign if it is a multiview tensor.
1411					if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1412						(sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1413						sub_arena_out_tensors[idx] = sub_tensor;
1414				}
1415		}
1416	// Assigning out the tensors (in case of sharing tensors / in-place ops).
1417	for (i = 0; i < tensor_symbol_info_size; i++)
1418		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
1419		{
1420			const int vt_ref = alloc_prep->vt_blocks[i];
1421			const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1422			// Either we have dup_tensor_block_ref in current layer, or we have that in
1423			// previous layer, therefore, cannot really find the buffer ptr.
1424			if (tensor_blocks[i].size > 0 &&
1425				(!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1426				((graph_prep->dup_tensor_block_ref &&
1427				  graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1428				  graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1429				 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1430			{
1431				assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1431, __extension__ __PRETTY_FUNCTION__
); })); // This must be in a sub-graph.
1432				// If this is an input tensor, and it need to be preserved, wait until when we go through inputs to preserve.
1433				if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1434					continue;
1435				const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1436				tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1437				ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1438			} else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1439				if (tensor_blocks[i].size == 0)
1440				{
1441					int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1442					ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1443					*tensor = ccv_nnc_tensor(0, tensor_symbol_info[i].info, 0);
1444					tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1445					continue;
1446				}
1447				// When we want to allocate, we don't really need to if it need force broadcast, because we will handle that later.
1448				const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1449				// If already created, use the same tensor, and continue.
1450				// Having ptr.
1451				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1452				ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1453				// Also, set its allocations.
1454				// Since tensor view is bit compatible with tensor, we can just cast.
1455				void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1456				*tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1457				assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1457, __extension__ __PRETTY_FUNCTION__
); }));
1458				// If we need to force broadcast, we need to wrap it in a multiview.
1459				if (graph_prep->tensor_blocks[i].p_refs[0] &&
1460					_ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1461				{
1462					const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1463					ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1464					ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1465					ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1466						tv,
1467					}, 0, 1, graph_prep->graph, mv);
1468					CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1469					pos = mv_pos;
1470					ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1471				}
1472				tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1473			}
1474		}
1475#ifdef HAVE_MPS
1476	kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1477#endif
1478	// Handle binded tensors. First handle cases without aliases.
1479	for (i = 0; i < tensor_bind_size; i++)
1480	{
1481		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1481, __extension__ __PRETTY_FUNCTION__
); }));
1482		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1483		if (resolved_symbol.d >= 0)
1484		{
1485			int d = resolved_symbol.d;
1486			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1487				continue;
1488			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1489			// It has nothing to do with alias.
1490			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1491				d = tensor_blocks[d].ref - 1;
1492			// For binded tensors, it shouldn't be assigned yet.
1493			// If it is assigned, the pointer should match the ones from the binded tensor.
1494			// This can only happen if an enforced in-place tensor is binded twice. If that
1495			// happens, we need to make sure it is binded to the same location.
1496			assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1496, __extension__ __PRETTY_FUNCTION__
); }));
1497			// See above assertion.
1498			if (tensor_arena->vt_tensors[d])
1499				continue;
1500			if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1501			{
1502				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1503				ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1504				ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1505				if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1506					for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1507						{ assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
 ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1507, __extension__ __PRETTY_FUNCTION__
); })); }
1508				// It is OK to be just as a whole smaller or equal to the binded one.
1509				assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1509, __extension__ __PRETTY_FUNCTION__
); }));
1510				memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1511				memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1512				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1513			} else {
1514				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1515				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1516				*tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1517				tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1518				tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1519				tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1520				tv->dataof = tensor_binds[i].tensor->dataof;
1521				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1522			}
1523		}
1524	}
1525	// Handle binded tensors. We handle alias here so it can reference to binded tensors.
1526	for (i = 0; i < tensor_bind_size; i++)
1527	{
1528		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1528, __extension__ __PRETTY_FUNCTION__
); }));
1529		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1530		if (resolved_symbol.d >= 0)
1531		{
1532			int d = resolved_symbol.d;
1533			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1534				d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1535			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1536			// It has nothing to do with alias.
1537			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1538				d = tensor_blocks[d].ref - 1;
1539			if (tensor_arena->vt_tensors[d])
1540				continue;
1541			// Assert original alias has no ofs. Otherwise our binding will be problematic.
1542			for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1543				{ assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
 == 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1543, __extension__ __PRETTY_FUNCTION__
); })); }
1544			if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1545			{
1546				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1547				ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1548				ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1549				if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1550					for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1551						{ assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
 ("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1551, __extension__ __PRETTY_FUNCTION__
); })); }
1552				// It is OK to be just as a whole smaller or equal to the binded one.
1553				assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1553, __extension__ __PRETTY_FUNCTION__
); }));
1554				memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1555				memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1556				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1557			} else {
1558				int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1559				ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1560				*tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1561				tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1562				tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1563				tv->data = tensor_binds[i].tensor->data;
1564				tv->dataof = tensor_binds[i].tensor->dataof;
1565				tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1566			}
1567		}
1568	}
1569	// Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1570	// Avoiding refs that actually is an alias.
1571	for (i = 0; i < tensor_symbol_info_size; i++)
1572		// It could be binded tensor (or unused), in that case, it doesn't have a ref.
1573		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1574		{
1575			int ref = tensor_blocks[i].ref - 1;
1576			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1577				ref = tensor_blocks[ref].ref - 1;
1578			assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
 ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
 ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1578, __extension__ __PRETTY_FUNCTION__); }));
1579			tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1580		}
1581	// Now after refs assigned out, handle the case I need to preserve because I am a sub graph of while loop.
1582	if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1583	{
1584		assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1584, __extension__ __PRETTY_FUNCTION__
); }));
1585		const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1586		const int p_idx = graph_prep->p_idx - 1;
1587		for (i = 0; i < node->input_size; i++)
1588		{
1589			const int idx = node->inputs[i];
1590			int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx))) - 1;
1591			assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
 ({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1591, __extension__ __PRETTY_FUNCTION__); }));
1592			const int vt_ref = alloc_prep->vt_blocks[block_ref];
1593			if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1594				continue;
1595			assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
 (vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1595, __extension__ __PRETTY_FUNCTION__); }));
1596			const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1597			assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
 == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1597, __extension__ __PRETTY_FUNCTION__); }));
1598			assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
 == ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1598, __extension__ __PRETTY_FUNCTION__
); }));
1599			// Either we have dup_tensor_block_ref in current layer, or we have that in
1600			// previous layer, therefore, cannot really find the buffer ptr.
1601			if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1602				((graph_prep->dup_tensor_block_ref &&
1603				  graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1604				  graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1605				 !tensor_arena->buffers[buffer_ref].ptr))
1606			{
1607				// We haven't allocated anything for this yet.
1608				assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
 ? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1608, __extension__ __PRETTY_FUNCTION__
); }));
1609				const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1610				tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1611				ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1612			} else {
1613				const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1614				tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1615				ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1616			}
1617		}
1618	}
1619	// For case..of statement, the output is a phi variable, thus, if we take the skip branch, we will select the original input.
1620	// This created the multi-view tensor to achieve that.
1621	for (i = 0; i < tensor_symbol_info_size; i++)
1622		if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1623		{
1624			const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1625			// Create phi multi-view.
1626			const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1627			const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1628			const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1629			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1630			ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1631			ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1632			ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1633				intv,
1634				outv,
1635			}, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1636			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1637			CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1638			tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1639			ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1640		}
1641	// Now it is time to handle alias.
1642	for (i = 0; i < alloc_prep->block_size; i++)
1643		if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1644		{
1645			const int block_ref = alloc_prep->blocks[i].block_ref;
1646			if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1647			{
1648				// Assigning out the tensor aliases.
1649				assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1649, __extension__ __PRETTY_FUNCTION__
); }));
1650				_ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1651			}
1652		}
1653	// Now assigning out the rest of alias refs.
1654	for (i = 0; i < tensor_symbol_info_size; i++)
1655		// It could be binded tensor (or unused), in that case, it doesn't have a ref.
1656		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1657		{
1658			int ref = tensor_blocks[i].alias_ref - 1;
1659			assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
 ({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
 ("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1659, __extension__ __PRETTY_FUNCTION__); }));
1660			tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1661		}
1662	// Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1663	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1664		if (tensor_arena->sub_arenas[i])
1665		{
1666			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1667			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1668			for (j = 0; j < node->input_size; j++)
1669			{
1670				const int idx = node->inputs[j];
1671				const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1 : -1;
1672				if (s_idx < 0)
1673					continue;
1674				ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1675				// Only do the replacement if it is a multi-view tensor.
1676				// sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1677				if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1678				{
1679					// It cannot be binded tensor.
1680					assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1680, __extension__ __PRETTY_FUNCTION__
); }));
1681					const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1682					const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1683					ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1684					// If this tensor is also an multiview, we need to first generate a new tensor, and then generate a reference
1685					// to this tensor.
1686					if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1687					{
1688						const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1689						ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1690						ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1691						ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1692						ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1693						ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
 : (multiview)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)[0]);
1694						while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1695							tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
 ? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)[0]);
1696						*ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1697						ref_tensor->data = tv->data;
1698						ref_tensor->dataof = tv->dataof;
1699						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1700					} else
1701						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1702				}
1703			}
1704		}
1705	// After alias created, for case..of statement, we now revert back to flat tensor rather than multi-view.
1706	// No worries though, this new tensor is subscribed for the phi multi-view. More over, we have logic
1707	// when initialize case..of node, which will take the phi multi-view again.
1708	for (i = 0; i < tensor_symbol_info_size; i++)
1709		if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1710		{
1711			assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
 & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1711, __extension__ __PRETTY_FUNCTION__
); }));
1712			ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1713			assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
 ({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1713, __extension__ __PRETTY_FUNCTION__); }));
1714			tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1715		}
1716	// rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1717	for (i = 0; i < tensor_symbol_info_size; i++)
1718		if (tensor_arena->vt_tensors[i])
1719			tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1720	// Associate multiview tensors from sub arena to the parent.
1721	if (sub_arena_out_tensors)
1722	{
1723		for (i = 0; i < alloc_prep->block_size; i++)
1724			if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1725			{
1726				const int block_ref = alloc_prep->blocks[i].block_ref;
1727				if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1728					continue;
1729				int sub_arena_ref = block_ref;
1730				if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1731				{
1732					// Assigning out the tensor aliases.
1733					assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
 : 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1733, __extension__ __PRETTY_FUNCTION__
); }));
1734					const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1735					// It referenced to is not an alias.
1736					assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
 0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1736, __extension__ __PRETTY_FUNCTION__
); }));
1737					sub_arena_ref = alias_ref;
1738					if (!sub_arena_out_tensors[sub_arena_ref])
1739						continue;
1740				}
1741				if (!sub_arena_out_tensors[sub_arena_ref])
1742					continue;
1743				ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
) ? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1744				assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
 1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1744, __extension__ __PRETTY_FUNCTION__); }));
1745				// This is only possible if the vt_tensors is a phi node.
1746				if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1747				{
1748					// For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1749					ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1750					assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
 ({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
 ("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1750, __extension__ __PRETTY_FUNCTION__); }));
1751					assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
 ? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1751, __extension__ __PRETTY_FUNCTION__
); }));
1752					CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)[1]->alias_ref = (uintptr_t)mv;
1753					ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)[1]);
1754				} else {
1755					tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1756					ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1757				}
1758			}
1759	}
1760	// Go over all the tensors that has assign_ref. If the tensor it is assigned from is:
1761	// 1). From sub_arena_out_tensors, it could be possible that it now pointing to an area this arena doesn't know.
1762	// 2). From phi multi-view, for this case, it is in fact that this arena won't know which memory I am going to use prior.
1763	// Therefore, for above two scenarios, the tensor has assign_ref, even it is a multiview tensor, need to subscribe
1764	// to the output of assign_ref tensor.
1765	for (i = 0; i < tensor_symbol_info_size; i++)
1766		if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1767		{
1768			const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1769			ccv_nnc_tensor_t* assign_tensor;
1770			if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1771				assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
) ? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1772			else
1773				assign_tensor = tensor_arena->vt_tensors[assign_ref];
1774			ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1775		}
1776	// After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1777	for (i = 0; i < tensor_bind_size; i++)
1778	{
1779		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1779, __extension__ __PRETTY_FUNCTION__
); }));
1780		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1781		if (resolved_symbol.d >= 0)
1782		{
1783			int d = resolved_symbol.d;
1784			// This check is for in-place ops. Only in-place op could have unassigned but ref.
1785			// It has nothing to do with alias.
1786			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1787				d = tensor_blocks[d].ref - 1;
1788			// Note we don't trace back on alias. This is intentional.
1789			assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
 tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1789, __extension__ __PRETTY_FUNCTION__
); }));
1790		}
1791	}
1792	if (sub_arena_out_tensors)
1793		ccfreefree(sub_arena_out_tensors);
1794	// Rewire sub arena's tensor references.
1795	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1796		if (tensor_arena->sub_arenas[i])
1797		{
1798			const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1799			const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1800			for (j = 0; j < node->input_size; j++)
1801			{
1802				const int idx = node->inputs[j];
1803				const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
 (size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i))) - 1 : -1;
1804				if (s_idx < 0)
1805					continue;
1806				ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1807				// Only do the replacement if it is a multi-view tensor.
1808				// sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1809				if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1810				{
1811					// This is binded tensor, bind it now.
1812					if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1813						_ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1814					else
1815						_ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1816				}
1817			}
1818		}
1819	return tensor_arena;
1820}
1821 
1822static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1823{
1824	assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
 ; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1824, __extension__ __PRETTY_FUNCTION__); }));
1825	if ((intptr_t)graph == tensor_arena->graph_ref)
1826	{
1827		assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
 0 && pair_ref < tensor_arena->vt_tensor_size) ;
 else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1827, __extension__ __PRETTY_FUNCTION__
); }));
1828		return tensor_arena->vt_tensors[pair_ref];
1829	}
1830	int i;
1831	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1832		if (tensor_arena->sub_arenas[i])
1833		{
1834			ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1835			if (tensor)
1836				return tensor;
1837		}
1838	return 0;
1839}
1840 
1841static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1842{
1843	if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1844		tensor->type |= CCV_TAPE_ALLOC;
1845	else {
1846		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1847		mv->type |= CCV_TAPE_ALLOC;
1848		int i;
1849		for (i = 0; i < mv->repeat + mv->kind; i++)
1850			_ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i]);
1851	}
1852}
1853 
1854static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1855{
1856	assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
 __assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1856, __extension__ __PRETTY_FUNCTION__
); }));
1857	int i;
1858	for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1859	{
1860		if (graph_prep->tensor_symbol_info[i].pair_ref)
1861		{
1862			tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1863			// No need to continue check this if it is from its pair.
1864			continue;
1865		}
1866		if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1867		{
1868			// If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1869			if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
))
1870			{
1871				const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1872				if (vt_ref >= 0 &&
1873					TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc) == READ_ONLY)
1874					continue;
1875			}
1876			_ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1877		}
1878	}
1879	for (i = 0; i < graph_prep->sub_prep_size; i++)
1880		if (graph_prep->sub_preps[i])
1881			_ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1882}
1883 
1884static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1885{
1886	int i, found = 0;
1887	// Try to insert head.
1888	ccv_array_t* head = tensor_blocks.head;
1889	assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
 else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1889, __extension__ __PRETTY_FUNCTION__); }));
1890	for (i = 0; i < head->rnum;)
1891	{
1892		const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i)));
1893		if (head_idx == idx)
1894		{
1895			found = 1;
1896			break;
1897		}
1898		ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1899		if (cell.i32 && cell.i32[0] > 0)
1900		{
1901			/* If the current node is the parent of the head node, check if we found it or not. */
1902			/* If not found, replace the current one. */
1903			if (!found)
1904			{
1905				found = 1;
1906				*(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i))) = idx;
1907			} else {
1908				/* Remove the current one, change the rnum. */
1909				if (i < head->rnum - 1)
1910					*(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(i))) = *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(head->rnum - 1)));
1911				--head->rnum;
1912				continue;
1913			}
1914		} else {
1915			// If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1916			cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1917			if (cell.i32 && cell.i32[0] > 0)
1918			{
1919				found = 1;
1920				break;
1921			}
1922		}
1923		/* Advancing i. */
1924		++i;
1925	}
1926	/* If not found, push this idx to the end of the array. */
1927	if (!found)
1928		ccv_array_push(head, &idx);
1929	// Try to insert tail.
1930	found = 0;
1931	ccv_array_t* tail = tensor_blocks.tail;
1932	assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
 else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1932, __extension__ __PRETTY_FUNCTION__); }));
1933	for (i = 0; i < tail->rnum;)
1934	{
1935		const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i)));
1936		if (tail_idx == idx)
1937		{
1938			found = 1;
1939			break;
1940		}
1941		ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1942		if (cell.i32 && cell.i32[0] > 0)
1943		{
1944			/* If the current node is the child of the tail node, check if we found it or not. */
1945			/* If not found, replace the current one. */
1946			if (!found)
1947			{
1948				found = 1;
1949				*(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i))) = idx;
1950			} else {
1951				/* Remove the current one, change the rnum. */
1952				*(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(i))) = *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
 * (size_t)(tail->rnum - 1)));
1953				--tail->rnum;
1954				continue;
1955			}
1956		} else {
1957			// If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1958			cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1959			if (cell.i32 && cell.i32[0] > 0)
1960			{
1961				found = 1;
1962				break;
1963			}
1964		}
1965		/* Advancing i. */
1966		++i;
1967	}
1968	/* If not found, push this idx to the end of the array. */
1969	if (!found)
1970		ccv_array_push(tail, &idx);
1971}
1972 
1973ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1974{
1975	if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1976	{
1977		assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
 0 && symbol.d < tensor_arena->vt_tensor_size) ;
 else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1977, __extension__ __PRETTY_FUNCTION__
); }));
1978		ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1979		if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1980		{
1981			ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1982			while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1983				mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
1984			return (ccv_nnc_tensor_t*)mv;
1985		}
1986		return tensor;
1987	}
1988	int i;
1989	for (i = 0; i < tensor_arena->sub_arena_size; i++)
1990		if (tensor_arena->sub_arenas[i])
1991		{
1992			ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1993			if (tensor)
1994				return tensor;
1995		}
1996	return 0;
1997}
1998 
1999ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
2000{
2001	if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
2002	{
2003		assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
 >= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 2003, __extension__ __PRETTY_FUNCTION__
); }));
2004		return graph_exec_arena->graph_execs[symbol.d];
2005	}
2006	int i;
2007	for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
2008		if (graph_exec_arena->sub_arenas[i])
2009		{
2010			ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
2011			if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
2012				return exec;
2013		}
2014	return (ccv_nnc_graph_exec_t){}; // 0.
2015}
2016 
2017ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2018{
2019	return graph_exec_arena->source;
2020}
2021 
2022ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2023{
2024	return graph_exec_arena->destination;
2025}
2026 
2027// Check whether the head is the beginning of this block.
2028static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
2029{
2030	assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
 ({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 2030, __extension__ __PRETTY_FUNCTION__
); }));
2031	return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0))) == head_node);
2032}
2033 
2034// Check whether the tail is the end of this block.
2035static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2036{
2037	assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
 ({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2037, __extension__ __PRETTY_FUNCTION__
); }));
2038	return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0))) == tail_node);
2039}
2040 
2041// Make two tensor blocks one. Return 1 if that happened.
2042static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2043{
2044	// Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2045	if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2046		(!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2047		tensor_blocks[p_ref_0].tail->rnum == 1 &&
2048		tensor_blocks[p_ref_1].head->rnum == 1 &&
2049		tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2050		*(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
) == *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
))
2051	{
2052		// If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2053		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
 ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
 == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
 ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2053, __extension__ __PRETTY_FUNCTION__); }));
2054		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
 ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
 == UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
 ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2054, __extension__ __PRETTY_FUNCTION__); }));
2055		ccv_array_free(tensor_blocks[p_ref_0].tail);
2056		tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2057		if (tensor_blocks[p_ref_1].p_refs[0])
2058		{
2059			assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
 0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
 0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2059, __extension__ __PRETTY_FUNCTION__
); })); // It simply cannot have more than one p_refs, otherwise we cannot merge.
2060			if (!tensor_blocks[p_ref_0].p_refs[0])
2061				tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2062			else
2063				tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2064		}
2065		tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2066		TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
 & ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)));
2067		ccv_array_free(tensor_blocks[p_ref_1].head);
2068		if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2069			TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
 | UNFOLDABLE_AS_INPUT));
2070		// Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2071		TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
 & ~0x3) | UNASSIGNED));
2072		tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2073		if (!tensor_blocks[p_ref_0].r_refs)
2074			tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2075		ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2076		tensor_blocks[p_ref_1].size = 0;
2077		tensor_blocks[p_ref_1].head = 0;
2078		tensor_blocks[p_ref_1].tail = 0;
2079		return 1;
2080	}
2081	return 0;
2082}
2083 
2084static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2085{
2086	int i, j, k;
2087	// Generate exec dependencies (or, in other words, partial ordering of executions).
2088	ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2089	int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2090	int buf_size;
2091	if (p_node_info)
2092		{ assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
 if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2092, __extension__ __PRETTY_FUNCTION__
); })); }
2093#define for_block(x, val) \
2094	do { \
2095		if (((int32_t*)val)[0] > 0) \
2096		{ \
2097			buf[buf_size * 2] = x; \
2098			buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2099			++buf_size; \
2100		} \
2101	} while (0)
2102	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx; {
2103		buf_size = 0; /* save all its parent deps to this buffer */
2104		ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2105		if (vector)
2106			CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
 { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
 CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
 -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
 (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
 * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
 _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
 { int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
 size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
 = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
 = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
 (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
 = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0);
2107		if (!node->outgoings)
2108			continue;
2109		for (i = 0; i < node->outgoings->rnum; i++)
2110		{
2111			int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
2112			const int32_t one = 1;
2113			ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2114			/* If not found, set. If the current node is the destination node, there is no need
2115			 * to set itself as parent of subsequent nodes because of its terminal nature. */
2116			if (!cell.i32 || cell.i32[0] == 0)
2117				ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2118			if (buf_size > 0)
2119			{
2120				ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2121				assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2121, __extension__ __PRETTY_FUNCTION__); }));
2122				for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2123				{
2124					ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2125					/* If not found, set */
2126					if (!cell.i32 || cell.i32[0] == 0)
2127						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2128					else {
2129						/* Otherwise, set to the longest one */
2130						int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
 + 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; });
2131						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2132					}
2133				}
2134			}
2135		}
2136	} ccv_nnc_graph_visit_endfor} }
2137#undef for_block
2138	ccfreefree(buf);
2139	// This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2140	const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2141	ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2142	// The reason is that I need to make every one of them unassigned unless it is used somewhere. It
2143	// happens that I have to loop through all relevant nodes to find out if one is used or not.
2144	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2145		tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2146	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2147		for (i = 0; i < node->input_size; i++)
2148			if (node->inputs[i] >= 0)
2149			{
2150				tensor_blocks[node->inputs[i]].flags = 0;
2151				// If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2152				// This will get propagated back to the buffer, and used there to determine the allocation function to use.
2153				if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2154					(node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2155					tensor_blocks[node->inputs[i]].pin_mem = 1;
2156			}
2157		for (i = 0; i < node->output_size; i++)
2158			if (node->outputs[i] >= 0)
2159			{
2160				tensor_blocks[node->outputs[i]].flags = 0;
2161				// If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2162				// This will get propagated back to the buffer, and used there to determine the allocation function to use.
2163				if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2164					(node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2165					tensor_blocks[node->outputs[i]].pin_mem = 1;
2166			}
2167	} ccv_nnc_graph_visit_endfor} }
2168	if (p_node_info)
2169	{
2170		assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
 ({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2170, __extension__ __PRETTY_FUNCTION__
); }));
2171		// Mark it as used if it is used in either input or output.
2172		for (i = 0; i < p_node_info->input_size; i++)
2173			if (p_node_info->inputs[i] >= 0)
2174			{
2175				const int d = p_node_info->inputs[i];
2176				if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2177				{
2178					const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1))) - 1;
2179					if (dd >= 0) // If this exists in this sub-graph, great.
2180						tensor_blocks[dd].flags = 0;
2181				}
2182			}
2183		for (i = 0; i < p_node_info->output_size; i++)
2184			if (p_node_info->outputs[i] >= 0)
2185			{
2186				const int d = p_node_info->outputs[i];
2187				if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2188				{
2189					const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1))) - 1;
2190					if (dd >= 0) // If this exists in this sub-graph, great.
2191						tensor_blocks[dd].flags = 0;
2192				}
2193			}
2194	}
2195	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2196		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2197		{
2198			// Check no tensor info is auto now.
2199			assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2199, __extension__ __PRETTY_FUNCTION__
); }));
2200			// If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
2201			// therefore, its own life-cycle almost certainly won't concatenate properly with the tensor to
2202			// fold to).
2203			if (tensor_symbol_info[i].assign_ref)
2204			{
2205				// TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2206				// It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2207				// it kept its own representation, which is not the case for output).
2208				TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
));
2209				const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2210				// But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2211				TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT));
2212				// It also cannot be folded as output (except i), because we need to keep its own representation.
2213				TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT));
2214				assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
 == 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2214, __extension__ __PRETTY_FUNCTION__
); }));
2215				tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2216				for (j = 0; j < unroll_count; j++)
2217				{
2218					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
 = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT));
2219					TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
 = (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT));
2220				}
2221				if (tensor_blocks[assign_ref].bypass_ref)
2222				{
2223					// If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2224					tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2225					const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2226					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT));
2227					TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT));
2228					// On the other hand, it can be folded into the except_ref for the bypass_ref.
2229					tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2230					if (dup_tensor_from_ref)
2231					{
2232						const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2233						if (bypass_from_ref >= 0)
2234						{
2235							TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT));
2236							TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT));
2237							assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
 + unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
 - 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2237, __extension__ __PRETTY_FUNCTION__
); }));
2238							for (j = 0; j < unroll_count - 1; j++)
2239							{
2240								// Mark every incarnation as unfold-able.
2241								TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
 + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
 * unroll_count + j]].flags | UNFOLDABLE_AS_INPUT));
2242								TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
 + j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
 * unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT));
2243							}
2244						}
2245					}
2246				}
2247			}
2248		}
2249	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2250	{
2251		// If it has a pair reference, we don't need to allocate this tensor at all,
2252		// set it to be unassigned.
2253		if (tensor_symbol_info[i].pair_ref)
2254			TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED));
2255		// If it is a tape variable, set it to be un-foldable as well (otherwise we cannot use tape properly).
2256		else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2257			TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2258			TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
));
2259			// For this case, there is no exception.
2260			tensor_blocks[i].unfoldable_except_ref = 0;
2261		} else if (tensor_symbol_info[i].p_ref) {
2262			assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2262, __extension__ __PRETTY_FUNCTION__); }));
2263			const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2264			// If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2265			if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2266				// TODO: This check can be lifted if we can fold in the parent graph.
2267				if (-1 == p_ref_is_in_or_out)
2268					TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2269			if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2270				TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
));
2271		}
2272	}
2273	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2274	{
2275		if (tensor_symbol_info[i].alias_ref)
2276		{
2277			const int ref = tensor_symbol_info[i].alias_ref - 1;
2278			// If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2279			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2280				tensor_blocks[ref].flags = 0;
2281			// An alias cannot ref to another alias.
2282			assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
 __assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2282, __extension__ __PRETTY_FUNCTION__); }));
2283			tensor_blocks[i].flags = ALIAS;
2284			tensor_blocks[i].ref = ref + 1; // Assign the ref.
2285			if (!tensor_blocks[ref].r_refs)
2286				tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2287			ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2288		}
2289	}
2290	// Scan again and if the ref is not assigned, mark the alias not assigned.
2291	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2292		if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2293		{
2294			const int ref = tensor_blocks[i].ref - 1;
2295			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2296			{
2297				// Mark this as unassigned.
2298				tensor_blocks[i].flags = UNASSIGNED;
2299				tensor_blocks[i].ref = 0;
2300			}
2301		}
2302	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2303	{
2304		// If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2305		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)))
2306		{
2307			tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2308			tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2309			// Cache tensor size (align to 16 bytes).
2310			tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2311		}
2312		// If there is a p_ref, add the one to the p_refs list.
2313		if (tensor_symbol_info[i].p_ref)
2314			tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2315	}
2316	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2317		for (i = 0; i < node->input_size; i++)
2318		{
2319			int d = node->inputs[i];
2320			if (d < 0)
2321				continue;
2322			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2323				d = tensor_symbol_info[d].alias_ref - 1;
2324			tensor_blocks[d].flags |= READ_ONLY;
2325			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2326				continue;
2327			assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2327, __extension__ __PRETTY_FUNCTION__
); }));
2328			/* If this is first encounter, its head starts (this tensor is init'ed outside of the graph
2329			 * from the very beginning of the graph life-cycle and ends here. */
2330			if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
)))
2331			{
2332				for (j = 0; j < source_size; j++)
2333				{
2334					// If the source is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2335					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2336					if (cell.i32 && cell.i32[0] > 0)
2337						_ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2338				}
2339				/* If this is a read-only (based on SSA, if first encountered as read), and this is
2340				 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2341				 * loop, however, in that case, you need to prevent read-only gets reused for the
2342				 * output tensor, which is not obvious how to implement correctly), and it is not
2343				 * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2344				 * of memory anyway (because on second loop, we want to read the same value out).
2345				 * Mark it to the end of the graph. */
2346				if (p_node_info && !tensor_symbol_info[d].assign_ref)
2347					for (j = 0; j < destination_size; j++)
2348					{
2349						// If the destination is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2350						const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2351						if (cell.i32 && cell.i32[0] > 0)
2352							_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2353					}
2354			}
2355			_ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2356		}
2357		for (i = 0; i < node->output_size; i++)
2358		{
2359			int d = node->outputs[i];
2360			if (d < 0)
2361				continue;
2362			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2363				d = tensor_symbol_info[d].alias_ref - 1;
2364			tensor_blocks[d].flags |= WRITE_ONLY;
2365			if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2366				continue;
2367			assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2367, __extension__ __PRETTY_FUNCTION__
); }));
2368			_ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2369		}
2370	} ccv_nnc_graph_visit_endfor} }
2371	// For any assign_ref, its life-time kept until the end and wrap over.
2372	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2373		// If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2374		// that "somewhere else" need to keep its life-time til the end.
2375		if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) &&
2376			p_node_info && tensor_symbol_info[i].assign_ref)
2377		{
2378			const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2379			for (j = 0; j < destination_size; j++)
2380			{
2381				// This logic is to be more conservative about which destination we add to.
2382				// As of now, if we add everything, it is fine most likely. However, it may
2383				// cause issues in the future to do so naively. Thus, instead, we only add
2384				// the destination to it iff either the tensor is not used at all, or, the
2385				// destination is on the same stream as of the tensor block some way.
2386				int flag = !tensor_blocks[assign_ref].tail;
2387				for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2388				{
2389					const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
 + (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)));
2390					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2391					flag = (cell.i32 && cell.i32[0] > 0);
2392				}
2393				if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2394					_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2395			}
2396		}
2397	for (i = 0; i < output_size; i++)
2398	{
2399		assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
 __assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2399, __extension__ __PRETTY_FUNCTION__); }));
2400		int d = outputs[i].d;
2401		if (d < 0)
2402			continue;
2403		if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2404			d = tensor_symbol_info[d].alias_ref - 1;
2405		if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2406			continue;
2407		assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2407, __extension__ __PRETTY_FUNCTION__
); }));
2408		for (j = 0; j < destination_size; j++)
2409		{
2410			int flag = !tensor_blocks[d].tail;
2411			for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2412			{
2413				const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)));
2414				const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2415				flag = (cell.i32 && cell.i32[0] > 0);
2416			}
2417			if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2418				_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2419		}
2420	}
2421	// Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2422	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2423		int x, y;
2424		for (x = 0; x < node->input_size; x++)
2425			for (y = 0; y < node->output_size; y++)
2426				/* Some operations enforces some tensors to be the same for inputs / outputs. */
2427				if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2428				{
2429					// If both unassigned, it is fine.
2430					if (node->inputs[x] < 0 && node->outputs[y] < 0)
2431						continue;
2432					int ref = node->inputs[x];
2433					assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2433, __extension__ __PRETTY_FUNCTION__); }));
2434					while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
2435						ref = tensor_blocks[ref].ref - 1;
2436					const int node_output_y = node->outputs[y];
2437					assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
 ({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2437, __extension__ __PRETTY_FUNCTION__
); }));
2438					// If both are not computable, it is fine, we don't need to enforce.
2439					if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) &&
2440						!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
 !((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
)))
2441						continue;
2442					// Otherwise, enforce and error out if failed.
2443					if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2444						{ assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2444, __extension__ __PRETTY_FUNCTION__
); })); }
2445				}
2446	} ccv_nnc_graph_visit_endfor} }
2447	// Ignore tensors that are already bound, no matter if they are used or not. Doing it here because
2448	// we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2449	// that is not enforced in-place (because the tensor enforced in-place will be different than the
2450	// binding one).
2451	for (i = 0; i < tensor_bind_size; i++)
2452	{
2453		const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2454		// If there is a tensor bound, then it is unassigned.
2455		if (resolved_symbol.d >= 0)
2456		{
2457			int d = resolved_symbol.d;
2458			// I cannot assert too much at this moment.
2459			if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2460				d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2461			// This check is for in-place ops. Only in-place op could have unassigned but ref.
2462			// It has nothing to do with alias.
2463			while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2464				d = tensor_blocks[d].ref - 1;
2465			// Doesn't work if this is a loop carrying variable.
2466			assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
 __extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
 __assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2466, __extension__ __PRETTY_FUNCTION__); }));
2467			tensor_blocks[d].flags = UNASSIGNED;
2468			tensor_blocks[d].ref = 0; // No need to have ref as well.
2469		}
2470	}
2471	// Maximum tensor reuse by collapse tensors allows in-place operations (and it matches the start, end tensor).
2472	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2473		int x, y;
2474		for (x = 0; x < node->input_size; x++)
2475		{
2476			/* If the input is not assigned, it can be referenced, find the referenced one */
2477			int ref = node->inputs[x];
2478			if (ref < 0)
2479				continue;
2480			const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2481			while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
2482				ref = tensor_blocks[ref].ref - 1;
2483			assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
 ({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2483, __extension__ __PRETTY_FUNCTION__
); }));
2484			if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) &&
2485				tensor_blocks[ref].tail->rnum == 1)
2486			{
2487				for (y = 0; y < node->output_size; y++)
2488					/* Only proceed if the input symbol is different from the output symbol, */
2489					/* and the input symbol meets the output symbol exactly at the same spot. */
2490					if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2491						node->outputs[y] >= 0 &&
2492						ref != node->outputs[y] &&
2493						TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
 0x3) == UNASSIGNED)))
2494					{
2495						const int node_output_y = node->outputs[y];
2496						const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2497						/* If dimension matches perfectly, then we can assign y_symbol to x.
2498						 * If both of them are aliases, making sure their origin matches in size too. */
2499						if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2500						{
2501							_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2502							// This refers to an alias itself, now mark it and will be processed later.
2503							if (ref != node->inputs[x])
2504								tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2505						}
2506					}
2507			}
2508		}
2509	} ccv_nnc_graph_visit_endfor} }
2510	// Specifically handle the bypass. This need to be done after the first pass.
2511	// I need to extend the bypass life-time to the same as the one I am going with.
2512	// It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2513	ccv_nnc_tensor_block_t empty_block = {};
2514	empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2515	empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2516	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2517		if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2518		{
2519			int can_bypass = 1;
2520			for (i = 0; can_bypass && i < node->output_size; i++)
2521			{
2522				int d = node->outputs[i];
2523				if (d < 0)
2524					continue;
2525				if (!tensor_blocks[d].bypass_ref)
2526					continue;
2527				while (tensor_blocks[d].ref)
2528					d = tensor_blocks[d].ref - 1;
2529				int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2530				while (tensor_blocks[bypass_ref].ref)
2531					bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2532				// If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2533				if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2534					continue;
2535				ccv_array_clear(empty_block.head);
2536				for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2537					ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
 + (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j))));
2538				ccv_array_clear(empty_block.tail);
2539				for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2540					ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
 + (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j))));
2541				for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2542					_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j))), empty_block);
2543				for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2544					_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j))), empty_block);
2545				// It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2546				assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
 tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
 ({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
 tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
 ("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2546, __extension__ __PRETTY_FUNCTION__
); }));
2547				int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2548				while (tensor_blocks[b_ref].ref)
2549					b_ref = tensor_blocks[b_ref].ref - 1;
2550				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2551				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2552				// These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2553				// even after we extend the life-time of bypass_ref. Then we are in a good shape.
2554				can_bypass = can_bypass && (a_hop_b || b_hop_a);
2555			}
2556			if (can_bypass)
2557			{
2558				for (i = 0; i < node->output_size; i++)
2559				{
2560					int d = node->outputs[i];
2561					if (d < 0)
2562						continue;
2563					if (!tensor_blocks[d].bypass_ref)
2564						continue;
2565					while (tensor_blocks[d].ref)
2566						d = tensor_blocks[d].ref - 1;
2567					int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2568					while (tensor_blocks[bypass_ref].ref)
2569						bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2570					// The bypass_ref can extend its life-time.
2571					for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2572						_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]);
2573					for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2574						_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j))), tensor_blocks[bypass_ref]);
2575				}
2576			} else {
2577				for (i = 0; i < node->output_size; i++)
2578					tensor_blocks[node->outputs[i]].bypass_ref = 0;
2579				const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2580				// Mark this exec as no bypass IO (thus, I need to insert explicit data transfer.
2581				exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2582			}
2583		}
2584	} ccv_nnc_graph_visit_endfor} }
2585	ccv_array_free(empty_block.head);
2586	ccv_array_free(empty_block.tail);
2587	*r_exec_dep = exec_dep;
2588	*r_tensor_blocks = tensor_blocks;
2589}
2590 
2591static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2592{
2593	if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2594	{
2595		ccv_nnc_cmd_t retval = cmd;
2596		retval.cmd = CCV_NNC_NOOP;
2597		return retval;
2598	}
2599	return cmd;
2600}
2601 
2602static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2603{
2604	if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2605	{
2606		if (tensor_symbol_info[input].alias_ref)
2607		{
2608			const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2609			assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2609, __extension__ __PRETTY_FUNCTION__
); }));
2610			ccv_nnc_tensor_symbol_t tensor_symbol = {};
2611			if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2612			{
2613				tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2614				if (tensor_symbol_info[alias_ref].pair_ref)
2615					ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2616						.d = tensor_symbol_info[alias_ref].pair_ref - 1,
2617						.graph = dup_graph->pair
2618					});
2619				ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2620				dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2621			} else {
2622				tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2623				tensor_symbol.graph = dup_graph;
2624			}
2625			ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2626			if (tensor_symbol_info[input].pair_ref)
2627				ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2628					.d = tensor_symbol_info[input].pair_ref - 1,
2629					.graph = dup_graph->pair
2630				});
2631			ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2632			dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2633		} else {
2634			ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2635			if (tensor_symbol_info[input].pair_ref)
2636				ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2637					.d = tensor_symbol_info[input].pair_ref - 1,
2638					.graph = dup_graph->pair
2639				});
2640			ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2641			dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2642		}
2643		if (tensor_symbol_info[input].bypass_ref)
2644		{
2645			const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2646			assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
 ({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2646, __extension__ __PRETTY_FUNCTION__
); }));
2647			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])));
2648			symbol_info->bypass_ref = dup_bypass_ref + 1;
2649		}
2650	}
2651	return (ccv_nnc_tensor_symbol_t) {
2652		.d = dup_tensor_block_ref[input * unroll_count],
2653		.graph = dup_graph,
2654	};
2655}
2656 
2657static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2658{
2659	int i;
2660	if (dup_exec_ref[idx * unroll_count] < 0)
2661	{
2662		// Input has to come before output, because output could has a bypass reference to the input.
2663		for (i = 0; i < node->input_size; i++)
2664			max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2665		for (i = 0; i < node->output_size; i++)
2666			max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2667		ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2668		dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2669	}
2670	return (ccv_nnc_graph_exec_symbol_t) {
2671		.d = dup_exec_ref[idx * unroll_count],
2672		.graph = dup_graph,
2673	};
2674}
2675 
2676static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2677{
2678	int i;
2679	for (i = 0; i < tensor_block_size; i++)
2680	{
2681		if (tensor_blocks[i].head)
2682			ccv_array_free(tensor_blocks[i].head);
2683		if (tensor_blocks[i].tail)
2684			ccv_array_free(tensor_blocks[i].tail);
2685		if (tensor_blocks[i].r_refs)
2686			ccv_array_free(tensor_blocks[i].r_refs);
2687		if (tensor_blocks[i].dup_p_refs)
2688			ccv_array_free(tensor_blocks[i].dup_p_refs);
2689	}
2690	ccfreefree(tensor_blocks);
2691}
2692 
2693// Find tensors that cannot be solved by co-allocating to the same location.
2694static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2695{
2696	int i, j, unroll_count = 0;
2697	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2698		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2699		{
2700			// This is is a parameter, thus, it has to be either an alias or used.
2701			assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
 & 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
 ("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2701, __extension__ __PRETTY_FUNCTION__
); }));
2702			const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2703			// The parameter it assign to has to be either an alias or used.
2704			assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2704, __extension__ __PRETTY_FUNCTION__
); }));
2705			// If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2706			// If it is the same, we are good, no need to extend.
2707			int a_ref = i;
2708			while (tensor_blocks[a_ref].ref)
2709				a_ref = tensor_blocks[a_ref].ref - 1;
2710			int b_ref = assign_ref;
2711			while (tensor_blocks[b_ref].ref)
2712				b_ref = tensor_blocks[b_ref].ref - 1;
2713			if (a_ref != b_ref)
2714			{
2715				// If any of the b's head is deterministically later than a's tail
2716				// or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2717				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2718				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2719				// It cannot be that both i can hop to j can j can hop to i.
2720				assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
 ? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
 > 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2720, __extension__ __PRETTY_FUNCTION__
); }));
2721				// Can it be folded
2722				// These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2723				if (a_hop_b || b_hop_a)
2724				{
2725					tensor_blocks[a_ref].companion_ref = b_ref + 1;
2726					tensor_blocks[b_ref].companion_ref = a_ref + 1;
2727					continue;
2728				}
2729				int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2730				for (j = 0; c_ref >= 0; j++)
2731				{
2732					while (tensor_blocks[c_ref].ref)
2733						c_ref = tensor_blocks[c_ref].ref - 1;
2734					c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2735				}
2736				unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
 = (j + 1); (_a > _b) ? _a : _b; });
2737			}
2738		}
2739	// Reset companion_ref if need to unroll.
2740	if (unroll_count)
2741		for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2742			tensor_blocks[j].companion_ref = 0;
2743	return unroll_count;
2744}
2745 
2746static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2747{
2748	int i, j, n;
2749	// The inout exec nodes, these are the nodes we are going to extend.
2750	uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2751	int max_input_size = 0;
2752	int max_output_size = 0;
2753	for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2754	{
2755		max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; });
2756		max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; });
2757	}
2758	ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })];
2759	ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })];
2760	// Doing graph expansion
2761	// It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
2762	assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
 0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2762, __extension__ __PRETTY_FUNCTION__
); }));
2763	assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
 0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2763, __extension__ __PRETTY_FUNCTION__
); }));
2764#define INCOMING_NODE (1)
2765#define OUTGOING_NODE (2)
2766	// Unroll the graph n times.
2767	for (n = 0; n < unroll_count; n++)
2768	{
2769		int* const dup_exec_ref = r_dup_exec_ref + n;
2770		const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2771		int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2772		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2773			dup_exec_ref[i * unroll_count] = -1;
2774		for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2775		{
2776			// If there is a assign_ref, that means I don't need to dup the tensor.
2777			if (tensor_symbol_info[i].assign_ref)
2778			{
2779				const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2780				dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2781			} else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED)) && TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2782			// If this is a read-only tensor block, no need to duplicate because the value never changes
2783			// (note we handled assign_ref first), therefore, no need to generate duplicate.
2784				dup_tensor_block_ref[i * unroll_count] = i;
2785			else
2786				dup_tensor_block_ref[i * unroll_count] = -1;
2787		}
2788		// Go through the original graph, make copies of the node if it is inout.
2789		ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
2790			ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2791			inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2792			if (!node->outgoings)
2793				continue;
2794			for (i = 0; i < node->outgoings->rnum; i++)
2795			{
2796				const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
2797				inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2798				ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2799				ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2800			}
2801		} ccv_nnc_graph_visit_endfor} }
2802		// Check the visitor are all marked as either incoming or outgoing.
2803		const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2804		const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2805		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2806		{
2807			if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2808				continue;
2809			assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
 OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
 INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
 ("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2809, __extension__ __PRETTY_FUNCTION__
); }));
2810			// If this is pure incoming nodes, then I need to concat this one with all original destination node
2811			if (inout[i] == INCOMING_NODE)
2812				for (j = 0; j < dup_destination_size; j++)
2813				{
2814					ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2815						.d = dup_destinations[j].d,
2816						.graph = dup_graph,
2817					}, (ccv_nnc_graph_exec_symbol_t) {
2818						.d = dup_exec_ref[i * unroll_count],
2819						.graph = dup_graph,
2820					});
2821				}
2822		}
2823		if (dup_graph->destinations)
2824			ccv_array_clear(dup_graph->destinations);
2825		for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2826		{
2827			if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2828				continue;
2829			const int d = dup_exec_ref[i * unroll_count];
2830			ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
 + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)));
2831			// If this has no outgoing node, add to the destination.
2832			if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2833				ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2834					.graph = dup_graph,
2835					.d = d,
2836				});
2837		}
2838	}
2839#undef INCOMING_NODE
2840#undef OUTGOING_NODE
2841	ccfreefree(inout);
2842}
2843 
2844static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2845{
2846	int i;
2847	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2848		// Now can assign them (The dup) as companion.
2849		// Get to the last one, which we will wrap over.
2850		if (dup_tensor_symbol_info[i].assign_ref)
2851		{
2852			dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2853			dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2854			assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
 ; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2854, __extension__ __PRETTY_FUNCTION__
); }));
2855			dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2856		}
2857}
2858 
2859// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2860// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2861// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2862static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2863{
2864	int i, j, k;
2865	for (i = 0; i < p_node_info->output_size; i++)
2866	{
2867		const int d = p_node_info->outputs[i];
2868		const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx))) - 1;
2869		if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
 !((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED)))
2870			continue;
2871		for (k = 0; k < destination_size; k++)
2872			_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2873		// Add the duplicated destinations to the tensor_block_ref.
2874		for (j = 0; j < unroll_count; j++)
2875			for (k = 0; k < destination_size; k++)
2876			{
2877				const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2878				const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2879				if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2880					_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2881			}
2882	}
2883}
2884 
2885static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2886{
2887	int i, j;
2888	ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2889	ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2890	// blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2891	// Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2892	// No need to change anything, we are good.
2893	const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2894	if (!unroll_count)
2895		return;
2896	// Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2897	// Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2898	ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2899	int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2900	int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2901	_ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2902	ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2903	ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2904	ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
(dup_graph->exec_symbol_info->rnum) + _incoming_edges_ >
 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
 = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = {
 (dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1
; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for
 (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
 _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue
; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r =
 1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) {
 ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph
->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1
; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
 const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_
[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((
_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum
)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->destinations->rnum); _exist_size_[1] =
 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
 const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
 for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
 == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
 = 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
 int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
 6 && _d_ < (dup_graph->destinations->rnum))
 { _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
 < ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 6 && _d_ <
 (dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0
), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->
rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
 ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
 ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
 ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
 (size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
 <= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
 ({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
2905	ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2906	_ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2907	// Free out the old exec_dep
2908	ccv_matrix_free(exec_dep);
2909	// and the tensor blocks, prepare for the new.
2910	_ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2911	// A reverse map to find where the original tensor comes from.
2912	int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2913	for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2914		dup_tensor_from_ref[i] = -1;
2915	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2916		for (j = 0; j < unroll_count; j++)
2917			if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2918				dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2919	int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2920	for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2921		dup_exec_from_ref[i] = -1;
2922	for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2923	{
2924		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2925			continue;
2926		dup_exec_from_ref[i] = i; // Reference back.
2927		for (j = 0; j < unroll_count; j++)
2928			if (dup_exec_ref[i * unroll_count + j] >= 0)
2929				dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2930	}
2931	// Reset all attr.
2932	memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2933	_ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0))), dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2934	ccv_nnc_graph_visit_free(dup_visit);
2935	ccfreefree(dup_exec_symbol_info);
2936	ccfreefree(dup_exec_from_ref);
2937	ccfreefree(dup_tensor_from_ref);
2938	// Assign out dup_p_ref, which will be used to extended the anonymous block life-time.
2939	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2940		// Loop over all possible duplications to assign dup_p_ref properly.
2941		for (j = 0; j < unroll_count; j++)
2942		{
2943			const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2944			if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2945			{
2946				const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2947				const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2948				if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2949				{
2950					if (!tensor_blocks[dup_idx].dup_p_refs)
2951						tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2952					ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2953				}
2954				if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2955					continue;
2956				const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2957				const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2958				if (p_ref_1_is_in_or_out == 1)
2959				{
2960					if (!tensor_blocks[dup_idx].dup_p_refs)
2961						tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2962					ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2963				}
2964			}
2965		}
2966	// companion_ref
2967	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2968		// Now can assign them (The dup) as companion.
2969		if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2970		{
2971			// Get to the last one, which we will wrap over.
2972			const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2973			if (assign_ref >= 0)
2974			{
2975				int b_ref = assign_ref;
2976				while (tensor_blocks[b_ref].ref)
2977					b_ref = tensor_blocks[b_ref].ref - 1;
2978				int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2979				int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2980				// It cannot be that both i can hop to j can j can hop to i.
2981				// And it can be hop from one to another now after duplication.
2982				assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
 ({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
 ("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2982, __extension__ __PRETTY_FUNCTION__); }));
2983				tensor_blocks[i].companion_ref = b_ref + 1;
2984				tensor_blocks[b_ref].companion_ref = i + 1;
2985			}
2986		}
2987	ccfreefree(dup_tensor_symbol_info);
2988	// Extend the dup tensor block ref, prepare for future extensions.
2989	dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2990	for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2991		dup_tensor_block_ref[i] = -1;
2992	// Assign out changed properties.
2993	*r_exec_dep = exec_dep;
2994	*r_tensor_blocks = tensor_blocks;
2995	*r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2996	*r_dup_graph = dup_graph;
2997	*r_unroll_count = unroll_count;
2998	*r_dup_exec_ref = dup_exec_ref;
2999	*r_dup_tensor_block_ref = dup_tensor_block_ref;
3000}
3001 
3002static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
3003{
3004	if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
3005		return tensor_block_size;
3006	int i;
3007	const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
3008	int found_idx = tensor_block_size;
3009	for (i = 0; i < anonymous_block_free_list_cap; i++)
3010	{
3011		const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)));
3012		assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
 ({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 3012, __extension__ __PRETTY_FUNCTION__
); }));
3013		// If the type doesn't match, ignore.
3014		if (tensor_blocks[idx].type != type)
3015			continue;
3016		// Heuristic about how to select the best tensor block to move forward.
3017		// If the size is larger, and no dup_p_refs, found, I cannot do better than this, just return directly.
3018		if (tensor_blocks[idx].size >= size)
3019		{
3020			if (no_dup_p_refs)
3021				return idx;
3022			// Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
3023			// then we cannot do better than this, if that is the case, just return.
3024			if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
3025				_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
3026				return idx;
3027		}
3028		int64_t found_idx_size_diff;
3029		int64_t idx_size_diff;
3030		if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
3031			// Now, compare whether this one or the found_idx one is better.
3032			// At this point, there is no point of comparing the dup_p_refs, we only care about which one
3033			// is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
3034			(found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3035		{
3036			found_idx = idx;
3037			continue;
3038		}
3039		// No need to update if found_idx is better than idx.
3040		if (found_idx_size_diff > idx_size_diff)
3041			continue;
3042		// We bias towards the bigger one in case of similar.
3043		if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3044		{
3045			found_idx = idx;
3046			continue;
3047		}
3048		assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
 == tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 3048, __extension__ __PRETTY_FUNCTION__
); }));
3049		// On a tie, check which one has tighter life-cycle.
3050		if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3051		{
3052			// Check whether the current tensor blocks life-cycle is longer than the previous one.
3053			if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3054				(!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3055				 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3056				found_idx = idx;
3057			continue;
3058		}
3059		// Now both our size is smaller than requested size, in this case, we need to increase the tensor block size.
3060		// We prefer to choose the one that has life-cycle closer to the expected ones.
3061		if (no_dup_p_refs)
3062		{
3063			// Whoever is shorter wins.
3064			if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3065				(!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3066				 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3067				found_idx = idx;
3068			continue;
3069		}
3070		if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3071			continue;
3072		if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3073		{
3074			found_idx = idx;
3075			continue;
3076		}
3077		// If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3078		const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3079		const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3080		if (idx_after_request && found_idx_after_request)
3081		{
3082			if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3083				found_idx = idx;
3084			continue;
3085		} else {
3086			// We entered this branch must be either idx_after_request is false or found_idx_after_request is false or both.
3087			// If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3088			// Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3089			if (!found_idx_after_request && (idx_after_request ||
3090				_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3091				found_idx = idx;
3092			continue;
3093		}
3094	}
3095	return found_idx;
3096}
3097 
// Duplicate every breakpoint of symbolic_graph as a CCV_NNC_NOOP exec symbol that takes the
// tensor inputs of the parent while-node, and wire each duplicate into the graph next to the
// breakpoint it mirrors.
//
// symbolic_graph: the graph whose breakpoints are duplicated. It is modified: exec symbols and
//                 edges are added, and noops may be added to its sources / destinations.
// p_node_info:    the parent node. Only its flags and p_while.inputs / p_while.input_size are read.
//
// Returns 0 when p_node_info is null, when it does not carry CCV_NNC_GRAPH_EXEC_P_WHILE, or when
// none of its p_while.inputs is >= 0. Otherwise returns a newly created array of
// ccv_nnc_graph_exec_symbol_t in which element k is the noop made for symbolic_graph->breakpoints[k].
// Asserts that the graph has at least one breakpoint and that its sources and destinations are set.
3098static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3099{
3100	if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3101		return 0;
3102	int i, j, k;
	// First pass: count the non-negative entries of p_while.inputs so the array below can be sized.
3103	int input_size = 0;
3104	for (i = 0; i < p_node_info->p_while.input_size; i++)
3105		if (p_node_info->p_while.inputs[i] >= 0)
3106			++input_size;
3107	// If doesn't have tensor inputs (thus, only special inputs), just return.
3108	if (!input_size)
3109		return 0;
	// Second pass: wrap each non-negative input index as a tensor symbol bound to symbolic_graph.
	// inputs is a variable-length array sized by the count from the first pass.
3110	ccv_nnc_tensor_symbol_t inputs[input_size];
3111	input_size = 0;
3112	for (i = 0; i < p_node_info->p_while.input_size; i++)
3113		if (p_node_info->p_while.inputs[i] >= 0)
3114			inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3115				.d = p_node_info->p_while.inputs[i],
3116				.graph = symbolic_graph,
3117			};
3118	assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
 1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
 > 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3118, __extension__ __PRETTY_FUNCTION__
); }));
3119	ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
	// Snapshot the exec symbol count before any noop is created; the incoming-edge loop further
	// down iterates up to this count only.
	// NOTE(review): presumably ccv_nnc_graph_exec_symbol_new appends to exec_symbol_info, so this
	// snapshot keeps the new noops out of that loop -- confirm.
3120	const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3121	for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3122	{
3123		// Make a noop copy of the breakpoint, but with some tensor inputs.
3124		ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
		// Pushed in breakpoint order, so dup_breakpoints[i] corresponds to breakpoints[i].
3125		ccv_array_push(dup_breakpoints, &noop);
3126		// Connect this noop to the outgoing nodes of breakpoints.
		// The breakpoint's symbol info is looked up only after the noop was created above.
3127		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(symbolic_graph->breakpoints[i].d)));
3128		if (symbol_info->outgoings)
3129			for (j = 0; j < symbol_info->outgoings->rnum; j++)
3130			{
3131				const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)));
3132				ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3133					.d = d,
3134					.graph = symbolic_graph,
3135				});
3136			}
3137	}
	// For every non-dead exec symbol within the snapshot, mirror each outgoing edge that points
	// at a breakpoint with an edge from the same symbol to that breakpoint's noop.
3138	for (i = 0; i < exec_symbol_info_size; i++)
3139	{
3140		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(i)));
3141		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
3142			continue;
3143		if (symbol_info->outgoings)
3144		{
			// The outgoing count is read once, before any edge is added in the loop below.
			// NOTE(review): presumably ccv_nnc_graph_exec_symbol_concat appends to this same
			// outgoings array, which is why the bound is fixed up front -- confirm.
3145			const int outgoing_size = symbol_info->outgoings->rnum;
3146			for (j = 0; j < outgoing_size; j++)
3147			{
3148				const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)));
3149				for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3150					if (d == symbolic_graph->breakpoints[k].d)
3151					{
3152						ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(k)));
3153						ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3154							.d = i,
3155							.graph = symbolic_graph,
3156						}, noop);
3157						// Found, connected, exit.
3158						break;
3159					}
3160			}
3161		}
3162	}
3163	// Add the dup_breakpoints to source if necessary.
3164	assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 3164, __extension__ __PRETTY_FUNCTION__
); }));
	// The loop bound is fixed before any noop is added as a source.
3165	const int source_size = symbolic_graph->sources->rnum;
3166	for (i = 0; i < source_size; i++)
3167	{
3168		const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(i))))->d;
3169		for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3170			if (d == symbolic_graph->breakpoints[j].d)
3171			{
3172				ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)));
3173				ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3174				// Found, made, exit.
3175				break;
3176			}
3177	}
3178	// Add the dup_breakpoints to destination if necessary.
3179	assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
 ({ if (symbolic_graph->destinations) ; else __assert_fail
 ("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 3179, __extension__ __PRETTY_FUNCTION__); }));
	// The loop bound is fixed before any noop is added as a destination.
3180	const int destination_size = symbolic_graph->destinations->rnum;
3181	for (i = 0; i < destination_size; i++)
3182	{
3183		const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(i))))->d;
3184		for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3185			if (d == symbolic_graph->breakpoints[j].d)
3186			{
3187				ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)));
3188				ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3189				// Found, made, exit.
3190				break;
3191			}
3192	}
3193	return dup_breakpoints;
3194}
3195 
3196// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3197static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3198{
3199	assert(source_size > 0)((void) sizeof ((source_size > 0) ? 1 : 0), __extension__ (
{ if (source_size > 0) ; else __assert_fail ("source_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3199, __extension__ __PRETTY_FUNCTION__
); }));
3200	assert(destination_size > 0)((void) sizeof ((destination_size > 0) ? 1 : 0), __extension__
 ({ if (destination_size > 0) ; else __assert_fail ("destination_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3200, __extension__ __PRETTY_FUNCTION__
); }));
3201	// First, fill all the "auto" holes.
3202	// This is the symbol table with the "auto" info filled in.
3203	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3204	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3205	ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3206	ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
 c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
 = ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
 > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
 _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
 = (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3206, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
 } _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
 3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
 __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3206, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
 _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3206, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
 1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
 _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
 _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
 ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
 ((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
 : 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
3207	ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3208	int i, j, k, p, q;
3209	const ccv_nnc_graph_exec_symbol_info_t* const  p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3210	ccv_sparse_matrix_t* exec_dep;
3211	ccv_nnc_tensor_block_t* tensor_blocks;
3212	_ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3213	int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3214	// Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3215	// are automatically filled in, and all the sub-graphs are processed.
3216	// There is a last step though, for a while loop, it is parameterized:
3217	// while (x > 5) {
3218	//     y = x + 1;
3219	// } (y => x) // This means after this loop is done, y's value will be copied over to x.
3220	// We will do our best to avoid the actual data copy; what we do here is check whether y can be x's alias.
3221	// If y can be x's alias, this is good and no other changes are required. In the above case, y can be x's alias because
3222	// it is an in-place operation.
3223	// But if y cannot be x's alias, for example, this while loop looks like this:
3224	// while (x > 5) {
3225	//     y = x + a
3226	//     b = x + y
3227	// } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3228	// For this example, y cannot be x's alias because x is used later to compute b (and that computation
3229	// has dependency on y as well).
3230	// For this case, we need to modify the computation graph. Previously, the graph looks like this:
3231	// y = x + a -> b = x + y
3232	// This graph will be extended to look like this:
3233	// y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3234	// while (x0 > 5) {
3235	//     y0 = x0 + a0
3236	//     b0 = x0 + y0
3237	//     if (y0 > 5) break
3238	//     y1 = y0 + b0
3239	//     b1 = y0 + y1
3240	// } (y1 => x0, b1 => a0)
3241	// After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3242	// with each other now).
3243	// With this algorithm, we don't need to insert any data copy logic, the only thing need is to switch pointers
3244	// which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
3245	ccv_nnc_symbolic_graph_t* dup_graph = 0;
3246	int* dup_exec_ref = 0;
3247	int* dup_tensor_block_ref = 0;
3248	int unroll_count = 0;
3249	// In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3250	ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3251	prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3252	prep->flags = 0;
3253	// Cannot handle dup a node that is a graph as well.
3254	if (p_exec_symbol_info)
3255	{
3256		prep->flags = p_node_info->flags;
3257		if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3258		{
3259			_ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3260			_ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0))), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3261		} else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3262			// TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3263		}
3264	}
3265	ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3266	ccv_array_t* anonymous_block_free_list = 0;
3267	const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3268	// Record whether this tensor is folded in this round.
3269	uint32_t* const tensor_fold = (uint32_t*)ccmallocmalloc(sizeof(uint32_t) * tensor_fold_size);
3270	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
 int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx; {
3271		for (p = 0; p < node->graph_ref_size; p++)
3272		{
3273			assert(symbolic_graph->sub_graphs)((void) sizeof ((symbolic_graph->sub_graphs) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sub_graphs) ; else __assert_fail (
"symbolic_graph->sub_graphs", "ccv_nnc_symbolic_graph_compile.c"
, 3273, __extension__ __PRETTY_FUNCTION__); }));
3274			ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (
node)->_inline_graph_ref)[p] - 1)));
3275			ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3276			ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0)((void*)(((char*)((sub_graph->sources)->data)) + (size_t
)(sub_graph->sources)->rsize * (size_t)(0))), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0)((void*)(((char*)((sub_graph->destinations)->data)) + (
size_t)(sub_graph->destinations)->rsize * (size_t)(0))), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3277			sub_prep->dup_breakpoints = dup_breakpoints;
3278			sub_prep->p = prep;
3279			sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[p] - 1] = sub_prep;
3280			const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3281			const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3282			for (i = 0; i < s_alloc_prep->block_size; i++)
3283			{
3284				const int block_ref = s_alloc_prep->blocks[i].block_ref;
3285				const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3286				if (block_ref < sub_prep->tensor_symbol_info_size)
3287				{
3288					// If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3289					// I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3290					if (s_tensor_blocks[block_ref].bypass_ref)
3291					{
3292						int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3293						while (s_tensor_blocks[bypass_ref].ref)
3294							bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3295						if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3296							s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3297							continue;
3298					}
3299					if (s_tensor_blocks[block_ref].p_refs[0])
3300					{
3301						/* If it is already properly assigned, next. */
3302						if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3303							s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3304						{
3305							if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3306								s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3307							else {
3308								assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3308, __extension__ __PRETTY_FUNCTION__
); }));
3309								s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3310							}
3311						}
3312						/* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3313						if (s_tensor_blocks[block_ref].p_refs[1] &&
3314							s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3315							s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3316						{
3317							assert(s_alloc_prep->buffers[buffer_ref].p_refs[0])((void) sizeof ((s_alloc_prep->buffers[buffer_ref].p_refs[
0]) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[buffer_ref
].p_refs[0]) ; else __assert_fail ("s_alloc_prep->buffers[buffer_ref].p_refs[0]"
, "ccv_nnc_symbolic_graph_compile.c", 3317, __extension__ __PRETTY_FUNCTION__
); }));
3318							assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3318, __extension__ __PRETTY_FUNCTION__
); }));
3319							s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3320						}
3321					}
3322				} else if (s_tensor_blocks[block_ref].dup_p_refs) {
3323					/* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3324					 * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3325					 * these anonymous blocks that has dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3326					 * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3327					 * its life-time to the end of the output tensor. */
3328					if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3329						s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3330					for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3331						ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j)((void*)(((char*)((s_tensor_blocks[block_ref].dup_p_refs)->
data)) + (size_t)(s_tensor_blocks[block_ref].dup_p_refs)->
rsize * (size_t)(j))));
3332				}
3333			}
3334		}
3335		const int init_tensor_block_size = tensor_block_size;
3336		int rw_anonymous_buffer_size_cap = 0;
3337		int ro_anonymous_buffer_size_cap = 0;
3338		if (anonymous_block_free_list)
3339			ccv_array_clear(anonymous_block_free_list);
3340		memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3341		for (p = 0; p < node->graph_ref_size; p++)
3342		{
3343			ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[p] - 1];
3344			const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3345			int rw_anonymous_buffer_size = 0;
3346			int ro_anonymous_buffer_size = 0;
3347			for (i = 0; i < s_alloc_prep->buffer_size; i++)
3348				if (s_alloc_prep->buffers[i].p_refs[0])
3349				{
3350					/* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3351					int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3352					/* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3353					int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3354					assert(p_ref_0_is_in_or_out != 0)((void) sizeof ((p_ref_0_is_in_or_out != 0) ? 1 : 0), __extension__
 ({ if (p_ref_0_is_in_or_out != 0) ; else __assert_fail ("p_ref_0_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3354, __extension__ __PRETTY_FUNCTION__
); }));
3355					int unref_p_ref_0 = p_ref_0;
3356					while (tensor_blocks[unref_p_ref_0].ref)
3357						unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3358					/* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3359					assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3359, __extension__ __PRETTY_FUNCTION__); }));
3360					if (s_alloc_prep->buffers[i].p_refs[1])
3361					{
3362						int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3363						const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3364						assert(p_ref_1_is_in_or_out != 0)((void) sizeof ((p_ref_1_is_in_or_out != 0) ? 1 : 0), __extension__
 ({ if (p_ref_1_is_in_or_out != 0) ; else __assert_fail ("p_ref_1_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3364, __extension__ __PRETTY_FUNCTION__
); }));
3365						int unref_p_ref_1 = p_ref_1;
3366						while (tensor_blocks[unref_p_ref_1].ref)
3367							unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3368						/* See above comment for the similar p_ref_0 check. */
3369						assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]))((void) sizeof ((!((tensor_blocks[unref_p_ref_1].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_1].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 3369, __extension__ __PRETTY_FUNCTION__); }));
3370						assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out)((void) sizeof ((p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ; else __assert_fail ("p_ref_0_is_in_or_out != p_ref_1_is_in_or_out"
, "ccv_nnc_symbolic_graph_compile.c", 3370, __extension__ __PRETTY_FUNCTION__
); }));
3371						int p_ref_t;
3372						if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3373						{
3374							CCV_SWAP(p_ref_0, p_ref_1, p_ref_t)((p_ref_t) = (p_ref_0), (p_ref_0) = (p_ref_1), (p_ref_1) = (p_ref_t
));
3375							CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t)((p_ref_t) = (unref_p_ref_0), (unref_p_ref_0) = (unref_p_ref_1
), (unref_p_ref_1) = (p_ref_t));
3376						}
3377						p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3378						/* If the dimensions match, can fold. TODO: should the dimensions match perfectly here? */
3379						if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
3380						{
3381							const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3382							if (folded)
3383							{
3384								p_ref_0 = p_ref_1;
3385								unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3386								tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3387								for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3388								{
3389									const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3390									assert(folded && "the subsequent duplicates can be folded too.")((void) sizeof ((folded && "the subsequent duplicates can be folded too."
) ? 1 : 0), __extension__ ({ if (folded && "the subsequent duplicates can be folded too."
) ; else __assert_fail ("folded && \"the subsequent duplicates can be folded too.\""
, "ccv_nnc_symbolic_graph_compile.c", 3390, __extension__ __PRETTY_FUNCTION__
); }));
3391								}
3392							}
3393						}
3394					}
3395					/* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3396					 * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
3397					 * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3398					 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3399					 * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3400					 * associated with it, then we are good. */
3401					if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3402						(p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3403						(p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3404						TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3405					{
3406						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3407							{ assert(s_alloc_prep->buffers[i].p_refs[1] == 0)((void) sizeof ((s_alloc_prep->buffers[i].p_refs[1] == 0) ?
 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[i].p_refs
[1] == 0) ; else __assert_fail ("s_alloc_prep->buffers[i].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3407, __extension__ __PRETTY_FUNCTION__
); })); }
3408						/* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3409						 * is a long argument why that is the case, the digest is, it is much easier to control your output
3410						 * than your input). */
3411						s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3412						s_alloc_prep->buffers[i].p_refs[1] = 0;
3413						/* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3414						assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
 ("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3414, __extension__ __PRETTY_FUNCTION__); }));
3415						tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size)({ typeof (s_alloc_prep->buffers[i].size) _a = (s_alloc_prep
->buffers[i].size); typeof (tensor_blocks[unref_p_ref_0].size
) _b = (tensor_blocks[unref_p_ref_0].size); (_a > _b) ? _a
 : _b; });
3416						for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3417							tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3418								tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3419									tensor_blocks[unref_p_ref_0].size;
3420					} else {
3421						s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3422						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3423							++ro_anonymous_buffer_size;
3424						else
3425							rw_anonymous_buffer_size += unroll_count + 1;
3426					}
3427				} else {
3428					if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3429						++ro_anonymous_buffer_size;
3430					else
3431						rw_anonymous_buffer_size += unroll_count + 1;
3432				}
3433			if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3434			{
3435				const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3436				// All read-write buffer (potentially) can be reused between each case..of branch.
3437				rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3438				// Read-only buffer cannot be reused between each case..of branch.
3439				ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3440				/* Anonymous block, allocate additional tensor blocks for this. */
3441				/* This is either because this is an internal tensor (don't have p_ref) */
3442				/* or it is an anonymous block itself within the sub graphs of this while graph. */
3443				tensor_blocks = (ccv_nnc_tensor_block_t*)ccreallocrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3444				memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3445				if (dup_tensor_block_ref)
3446					dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3447				for (i = 0; i < s_alloc_prep->buffer_size; i++)
3448					if (!s_alloc_prep->buffers[i].p_refs[0])
3449					{
3450						if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3451						{
3452							assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)((void) sizeof ((tensor_block_size < init_tensor_block_size
 + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ? 1 : 0), __extension__ ({ if (tensor_block_size < init_tensor_block_size
 + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ; else __assert_fail ("tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap"
, "ccv_nnc_symbolic_graph_compile.c", 3452, __extension__ __PRETTY_FUNCTION__
); }));
3453							TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size])(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0x10) | ANONYMOUS));
3454							TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3455							tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3456							tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3457							tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3458							s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3459							tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3460							ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3461							ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3462							if (dup_p_refs && dup_p_refs->rnum > 0)
3463							{
3464								for (j = 0; j < dup_p_refs->rnum; j++)
3465								{
3466									const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3467									assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3467, __extension__ __PRETTY_FUNCTION__
); }));
3468									assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3468, __extension__ __PRETTY_FUNCTION__
); }));
3469									assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
 ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3469, __extension__ __PRETTY_FUNCTION__); }));
3470									// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3471									// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3472									if (tensor_symbol_info[dup_p_ref].p_ref)
3473									{
3474										const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3475										assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3475, __extension__ __PRETTY_FUNCTION__); }));
3476										const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3477										if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3478										{
3479											if (!tensor_blocks[tensor_block_size].dup_p_refs)
3480												tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3481											ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3482										}
3483									}
3484									if (!tensor_blocks[tensor_block_size].tail)
3485										tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3486									for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3487										_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
 (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k))), tensor_blocks[tensor_block_size]);
3488								}
3489							} else {
3490								tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3491								ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3492							}
3493							for (j = 0; j < source_size; j++)
3494								_ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3495							/* If this is a read-only (based on SSA, if first encountered as read), and this is
3496							 * sub-graph. Mark it to the end of the graph. */
3497							if (p_exec_symbol_info)
3498								for (j = 0; j < destination_size; j++)
3499									_ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3500							/* If it is read-only, it is self-reflecting. */
3501							for (k = 0; k < unroll_count; k++)
3502							{
3503								for (j = 0; j < destination_size; j++)
3504									if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3505									_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3506								/* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3507								assert(symbolic_graph->p)((void) sizeof ((symbolic_graph->p) ? 1 : 0), __extension__
 ({ if (symbolic_graph->p) ; else __assert_fail ("symbolic_graph->p"
, "ccv_nnc_symbolic_graph_compile.c", 3507, __extension__ __PRETTY_FUNCTION__
); }));
3508								dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3509							}
3510							++tensor_block_size;
3511						} else {
3512							ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3513							const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3514							const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3515							// Find suitable tensor block from the free list.
3516							TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS));
3517							TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3518							s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3519							if (new_anonymous_tensor_block)
3520							{
3521								tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3522								tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3523								tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3524								tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3525								ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3526							} else {
3527								tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3528								tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
 _a : _b; });
3529							}
3530							if (dup_p_refs && dup_p_refs->rnum > 0)
3531							{
3532								for (j = 0; j < dup_p_refs->rnum; j++)
3533								{
3534									const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3535									assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3535, __extension__ __PRETTY_FUNCTION__
); }));
3536									assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3536, __extension__ __PRETTY_FUNCTION__
); }));
3537									// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3538									// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3539									if (tensor_symbol_info[dup_p_ref].p_ref)
3540									{
3541										const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3542										assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3542, __extension__ __PRETTY_FUNCTION__); }));
3543										const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3544										if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3545										{
3546											if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3547												tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3548											ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3549										}
3550									}
3551									assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
 ({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3551, __extension__ __PRETTY_FUNCTION__); }));
3552									if (!tensor_blocks[tensor_block_idx].tail)
3553										tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3554									for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3555										_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
 (size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k))), tensor_blocks[tensor_block_idx]);
3556									// We have to add it to the wrap-around companion_ref as well.
3557									// TODO: Although we know this wastes space (any space in between current one and its companion_ref will still
3558									// be occupied and unlikely to be reused), we cannot really do too much about it because the companion_ref's
3559									// definition is too free-form and if we enforce a stronger guarantee on this (such as it must wrap around), this
3560									// guarantee may be broken down the line.
3561									if (tensor_blocks[dup_p_ref].companion_ref)
3562									{
3563										const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3564										for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3565											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3566										for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3567											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3568									}
3569								}
3570							} else if (new_anonymous_tensor_block) {
3571								tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3572								ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3573							}
3574							const int prev_tensor_block_idx = tensor_block_idx;
3575							if (new_anonymous_tensor_block)
3576							{
3577								if (!anonymous_block_free_list)
3578									anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3579								ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3580								++tensor_block_size;
3581							}
3582							for (k = 0; k < unroll_count; k++)
3583							{
3584								const int tensor_block_idx = new_anonymous_tensor_block ?
3585									(dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3586									dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3587								TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS));
3588								TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
 0xc)));
3589								if (new_anonymous_tensor_block)
3590								{
3591									tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3592									tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3593									tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3594									tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3595									/* Attach to duplicated exec for this tensor block. */
3596									ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3597								} else {
3598									tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3599									tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
 _a : _b; });
3600									_ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3601 
3602								}
3603								if (dup_p_refs && dup_p_refs->rnum > 0)
3604								{
3605									/* Not nil, not self-reflecting. */
3606									for (j = 0; j < dup_p_refs->rnum; j++)
3607									{
3608										const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)));
3609										assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3609, __extension__ __PRETTY_FUNCTION__
); }));
3610										assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3610, __extension__ __PRETTY_FUNCTION__
); }));
3611										// If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3612										// this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3613										if (tensor_symbol_info[dup_p_ref].p_ref)
3614										{
3615											const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3616											assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3616, __extension__ __PRETTY_FUNCTION__); }));
3617											const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3618											if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3619											{
3620												if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3621													tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3622												ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3623											}
3624										}
3625										assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref)((void) sizeof ((dup_tensor_block_ref[dup_p_ref * unroll_count
 + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count
 + k] != dup_p_ref) ? 1 : 0), __extension__ ({ if (dup_tensor_block_ref
[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref
[dup_p_ref * unroll_count + k] != dup_p_ref) ; else __assert_fail
 ("dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref"
, "ccv_nnc_symbolic_graph_compile.c", 3625, __extension__ __PRETTY_FUNCTION__
); }));
3626										const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3627										assert(tensor_blocks[dup_dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_dup_p_ref].tail) ? 1 : 0),
 __extension__ ({ if (tensor_blocks[dup_dup_p_ref].tail) ; else
 __assert_fail ("tensor_blocks[dup_dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3627, __extension__ __PRETTY_FUNCTION__); }));
3628										if (!tensor_blocks[tensor_block_idx].tail)
3629											tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3630										for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3631											_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q)((void*)(((char*)((tensor_blocks[dup_dup_p_ref].tail)->data
)) + (size_t)(tensor_blocks[dup_dup_p_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3632										// We have to add it to the wrap-around companion_ref as well.
3633										if (tensor_blocks[dup_dup_p_ref].companion_ref)
3634										{
3635											const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3636											for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3637												_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3638											for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3639												_ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q))), tensor_blocks[tensor_block_idx]);
3640										}
3641									}
3642								} else if (new_anonymous_tensor_block) {
3643									tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3644									ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3645								}
3646								if (new_anonymous_tensor_block)
3647									++tensor_block_size;
3648							}
3649						}
3650					}
3651			}
3652		}
3653	} ccv_nnc_graph_visit_endfor} }
3654	if (anonymous_block_free_list)
3655		ccv_array_free(anonymous_block_free_list);
3656	ccfreefree(tensor_fold);
3657	// It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3658	// the allocation dependencies, thus, which tensor is reused to the existing tensor.
3659	ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3660	prep->while_count_tensor = 0;
3661	prep->dup_breakpoints = 0;
3662	prep->p = 0;
3663	prep->symbolic_graph = symbolic_graph;
3664	prep->p_idx = symbolic_graph->p_idx;
3665	prep->exec_idx = symbolic_graph->exec_idx;
3666	prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3667	prep->sub_preps = sub_preps;
3668	prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3669	prep->exec_symbol_info = exec_symbol_info;
3670	prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3671	prep->tensor_symbol_info = tensor_symbol_info;
3672	prep->unroll_count = unroll_count;
3673	prep->dup_tensor_block_ref = dup_tensor_block_ref;
3674	prep->tensor_block_size = tensor_block_size;
3675	prep->tensor_blocks = tensor_blocks;
3676	prep->exec_flags = exec_flags;
3677	prep->visit = visit;
3678	prep->alloc_prep = alloc_prep;
3679	if (dup_graph)
3680		ccv_nnc_symbolic_graph_free(dup_graph);
3681	if (dup_exec_ref)
3682		ccfreefree(dup_exec_ref);
3683	return prep;
3684}
3685 
// Release a graph prep object together with everything it owns.
// Frees the tensor blocks, the exec flags, every non-null sub-prep (recursively), the sub_preps
// array itself, the copied tensor / exec symbol info, the optional dup_tensor_block_ref, the
// alloc_prep, the visit object, and finally the prep struct.
// @param prep The prep to free. It must not be used after this call.
3686static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3687{
3688	int i;
3689	_ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3690	ccfreefree(prep->exec_flags);
	// Sub-prep slots may be empty (null), only the populated ones are freed recursively.
3691	for (i = 0; i < prep->sub_prep_size; i++)
3692		if (prep->sub_preps[i])
3693			_ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3694	if (prep->sub_preps)
3695		ccfreefree(prep->sub_preps);
3696	ccfreefree(prep->tensor_symbol_info);
3697	ccfreefree(prep->exec_symbol_info);
3698	if (prep->dup_tensor_block_ref)
3699		ccfreefree(prep->dup_tensor_block_ref);
3700	_ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3701	ccv_nnc_graph_visit_free(prep->visit);
3702	ccfreefree(prep);
3703}
3704 
// Mark which preps need a while-count tensor.
// Walks every node in graph_prep->visit. For a node flagged CCV_NNC_GRAPH_EXEC_P_WHILE, each
// p_while input that is a while-count tensor symbol is decoded into a depth d; starting from the
// node's sub-prep, the ->p link is followed d times and while_count_tensor is set to 1 on the
// prep reached. Afterwards the function recurses into every sub-prep the node references.
// @param graph_prep The prep whose nodes (and, recursively, sub-preps) are inspected.
3705static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3706{
3707	int i, j;
3708	ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx; {
3709		if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3710		{
			// Graph refs are stored 1-based, hence the - 1; a while node must have a valid first ref.
3711			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[0] - 1;
3712			assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3712, __extension__ __PRETTY_FUNCTION__
); }));
3713			ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3714			for (i = 0; i < node->p_while.input_size; i++)
3715				if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3716				{
3717					ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
					// d is how many ->p links to follow from the sub-prep before marking.
3718					const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3719					for (j = 0; j < d; j++)
3720						prep = prep->p;
3721					prep->while_count_tensor = 1;
3722				}
3723		}
		// Recurse into every sub-graph this node references (negative refs mean "no graph").
3724		for (i = 0; i < node->graph_ref_size; i++)
3725		{
3726			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
3727			if (graph_ref >= 0)
3728				_ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3729		}
3730	} ccv_nnc_graph_visit_endfor} }
3731}
3732 
// Resolve a tensor symbol index to the tensor held by a prep's tensor arena.
// @param graph_prep The prep whose tensor arena (or an ancestor's, for while-count symbols) is consulted.
// @param symbol A non-negative tensor symbol index, CCV_NNC_NO_TENSOR_SYMBOL, or an encoded
//        while-count tensor symbol (asserted).
// @return vt_tensors[symbol] for a non-negative symbol, 0 for CCV_NNC_NO_TENSOR_SYMBOL, otherwise
//         the tensor fetched from the tensor_metadata of the prep reached by following ->p d times.
3733static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3734{
3735	if (symbol >= 0)
3736		return graph_prep->tensor_arena->vt_tensors[symbol];
3737	if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3738		return 0;
3739	assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
 : 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
 0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3739, __extension__ __PRETTY_FUNCTION__
); }));
3740	const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3741	int i;
	// The symbol encodes how many ->p links to follow to reach the prep owning the counter.
3742	const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3743	for (i = 0; i < d; i++)
3744		prep = prep->p;
3745	assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
 ({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3745, __extension__ __PRETTY_FUNCTION__
); }));
	// NOTE(review): position (0 << 1) + 1 is presumably where the while-count tensor is placed in the metadata - confirm against the tensor arena construction.
3746	return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3747}
3748 
// Topologically sort a concrete graph and rewrite the exec indices stored in its arena.
// ccv_nnc_graph_topsort fills exec_cvt (one int per exec in graph->exec_info); the arena's
// source, destination and every graph_execs entry that belongs to this graph are then
// re-indexed through that table.
// @param graph The concrete graph to sort.
// @param graph_exec_arena The arena whose stored exec indices are updated in place.
3749static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3750{
3751	int i;
	// Presumably exec_cvt[old index] gives the index after sorting - confirm against ccv_nnc_graph_topsort.
3752	int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3753	ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3754	graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3755	graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3756	ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
	// Entries whose graph differs (including unset ones with graph == 0) are left untouched.
3757	for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3758		if (graph_execs[i].graph == graph)
3759			graph_execs[i].d = exec_cvt[graph_execs[i].d];
3760	ccfreefree(exec_cvt);
3761}
3762 
// Build the graph exec arena for a symbolic graph: the table mapping each exec symbol to the
// concrete exec created on graph_prep->graph.
// Steps, in order:
//   1. Allocate the arena (header, graph_execs, then the sub_arenas pointers) and reset every
//      graph_execs entry to { d = CCV_NNC_NO_TENSOR_SYMBOL, graph = 0 }.
//   2. Visit the nodes and create a concrete exec for each one not yet created (while node,
//      case_of node, or a plain exec), recursing to build sub arenas for sub-graphs.
//   3. Visit again and connect each exec to its outgoing execs.
//   4. For tensors flagged init-zeros / init-ones, insert a SET_FORWARD exec ahead of their first users.
//   5. Register multiview tensors that no exec already wraps as "affected" on their head execs.
//   6. Pick (or create NOOP) source / destination execs and set them on the graph.
// @param symbolic_graph The symbolic graph being materialized.
// @param sources / source_size Source exec symbols of symbolic_graph.
// @param destinations / destination_size Destination exec symbols of symbolic_graph.
// @param graph_prep The prep for symbolic_graph (provides the concrete graph, visit order, symbol info, tensor blocks).
// @param tensor_arena The tensor arena providing vt_tensors for this graph and sub_arenas for sub-graphs.
// @return The newly allocated arena (a single ccmalloc block).
3763static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3764{
3765	int i, j, k;
3766	ccv_nnc_graph_t* const graph = graph_prep->graph;
3767	const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
	// NOTE(review): only exec_symbol_info_size - 1 extra graph_execs entries are requested, presumably because the arena struct already embeds one - confirm against the struct declaration.
	// NOTE(review): the analyzer path annotated below assumes exec_symbol_info_size <= 0 while graph_prep->visit still has nodes, so graph_execs[idx].graph would be read before the reset loop wrote it. Presumably visit never holds more nodes than exec_symbol_info_size - confirm.
3768	ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
1
Uninitialized value stored to field 'graph'→
3769	graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3770	graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3771	graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
	// The sub_arenas pointer table lives in the same allocation, right after the graph_execs entries.
3772	graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3773	memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3774	ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
	// Find the largest input / output / breakpoint counts so the scratch arrays below fit any node.
3775	int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3776	for (i = 0; i < exec_symbol_info_size; i++)
2
←
Assuming 'i' is >= 'exec_symbol_info_size'→
3
←
Loop condition is false. Execution continues on line 3785→
3777	{
3778		max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; });
3779		max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; });
3780		if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3781			max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
 : _b; });
3782		graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3783		graph_execs[i].graph = 0;
3784	}
3785	for (i = 0; i < graph_prep->sub_prep_size; i++)
4
←
Assuming 'i' is >= field 'sub_prep_size'→
5
←
Loop condition is false. Execution continues on line 3787→
3786		max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
 ((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; });
	// Scratch arrays reused for every node; sized at least 1 so the variable-length arrays are never zero-length.
3787	ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })];
6
←
'?' condition is true→
3788	ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })];
7
←
'?' condition is true→
3789	ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })];
8
←
'?' condition is true→
3790	const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3791	const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3792	// Create node, this is in topological order.
3793	ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx; {
9
←
Assuming '_i_' is < field 'size'→
10
←
Loop condition is true.  Entering loop body→
3794		if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
11
←
The left operand of '==' is a garbage value
3795		{
3796			for (i = 0; i < node->input_size; i++)
3797				max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3798			for (i = 0; i < node->output_size; i++)
3799				max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3800			if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3801			{
				// While node: create the while exec around the sub-graph, build the sub arena recursively, then install the loop expression with its inputs and breakpoints.
3802				const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[0] - 1;
3803				assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3803, __extension__ __PRETTY_FUNCTION__
); }));
3804				ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3805				ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3806				graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3807				const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)));
3808				ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3809				ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
				// max_inputs is reused from here on for the while expression's inputs, resolved against the sub-prep.
3810				for (i = 0; i < node->p_while.input_size; i++)
3811					max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3812				for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3813					max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3814				ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3815				_ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3816			} else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
				// case_of node: outputs that carry an alias_ref are replaced by the tensor they alias before the exec is created.
3817				for (i = 0; i < node->output_size; i++)
3818					if (max_outputs[i] && max_outputs[i]->alias_ref)
3819						max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3820				graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3821				// Check whether this is already covered in the inputs, if not, need to be covered in the update.
3822				for (i = 0; i < node->case_of.argument.offset; i++)
3823				{
3824					ccv_nnc_tensor_t* const update = max_inputs[i];
3825					if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3826						continue;
3827					int flag = 0;
					// NOTE(review): the arguments passed above span [offset, offset + size), but this scan stops at j < size - confirm whether the bound should be offset + size.
3828					for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3829						flag = (update == max_inputs[j]);
3830					if (!flag)
3831						ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3832				}
				// With NO_BYPASS_IO an extra data-transfer sub-graph occupies case slot 0, so the real sub-graphs are shifted by one.
3833				const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3834				ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3835				if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3836				{
3837					// Add another graph for data transfer.
3838					ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3839					for (i = 0; i < node->output_size; i++)
3840						max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3841					ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
 (node->output_size) _b = (node->output_size); (_a <
 _b) ? _a : _b; }), max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
 (node->output_size) _b = (node->output_size); (_a <
 _b) ? _a : _b; }));
3842					ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3843					ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3844					ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
					// Single-exec graph: the conversion table is one int and its value is not used afterwards.
3845					int exec_cvt;
3846					ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3847				}
3848				for (i = 0; i < node->graph_ref_size; i++)
3849				{
3850					const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
3851					if (graph_ref < 0)
3852						continue;
3853					ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3854					const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
 (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)));
3855					ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3856					ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3857					_ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3858				}
3859			} else {
3860				graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3861			}
3862			ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3863		}
3864	} ccv_nnc_graph_visit_endfor} }
3865	// Then connect them.
3866	ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx; {
3867		if (node->outgoings)
3868			for (i = 0; i < node->outgoings->rnum; i++)
3869			{
3870				const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)));
				// Only connect to outgoing nodes that actually got a concrete exec.
3871				if (graph_execs[outgoing].graph)
3872					ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3873			}
3874	} ccv_nnc_graph_visit_endfor} }
3875	int source_exec_created = 0;
3876	const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3877	const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3878	ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3879	// After the graph is materialized, we need to handle the case that some of these tensors require to be initialized to zero before use.
3880	for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3881	{
3882		if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
)))
3883		{
			// Follow alias_ref first, then tensor block refs, until a computable block (or the end of the chain) is reached.
3884			int ref = i;
3885			while (tensor_symbol_info[ref].alias_ref)
3886				ref = tensor_symbol_info[ref].alias_ref - 1;
3887			while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)) && tensor_blocks[ref].ref)
3888				ref = tensor_blocks[ref].ref - 1;
3889			// This is not computable. It could be that we marked a const tensor as init zero.
3890			if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED)))
3891				continue;
3892			// If this tensor is not used by any exec, we don't need to init at all. Skip.
3893			if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3894				continue;
3895			ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3896			// Now, we have the original tensor, we can get the actual tensor, and construct the set command.
			// One of the two branches below always assigns set_exec, because TENSOR_REQUIRE_INIT checked exactly these two flags.
3897			ccv_nnc_graph_exec_t set_exec;
3898			if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3899				set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
3900			else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3901				set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0), ccv_nnc_no_hint, 0, 0, &tensor, 1);
			// The set exec must run before every exec listed in the block's head.
3902			for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3903			{
3904				const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)));
3905				if (outgoing >= exec_symbol_info_size)
3906					continue;
3907				assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
 if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3907, __extension__ __PRETTY_FUNCTION__
); }));
3908				assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
 ({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3908, __extension__ __PRETTY_FUNCTION__
); }));
3909				ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3910			}
			// flags records whether any predecessor was chained in front of set_exec; each alloc_dep block's tail execs are chained that way.
3911			int flags = 0;
3912			if (alloc_dep[ref])
3913				for (j = 0; j < alloc_dep[ref]->rnum; j++)
3914				{
3915					const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)));
3916					// This is from alloc_dep, it should be computable.
3917					assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
 & 0x3) == ALIAS) && !((tensor_blocks[d].flags &
 0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3917, __extension__ __PRETTY_FUNCTION__
); }));
3918					if (tensor_blocks[d].tail)
3919						for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3920						{
3921							const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)));
3922							if (incoming >= exec_symbol_info_size)
3923								continue;
3924							assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
 if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3924, __extension__ __PRETTY_FUNCTION__
); }));
3925							assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
 ({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3925, __extension__ __PRETTY_FUNCTION__
); }));
3926							ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3927							flags = 1;
3928						}
3929				}
3930			// If cannot find a start node for this exec, we need to append it to the no-op of the start.
3931			if (!flags)
3932			{
3933				if (!source_exec_created)
3934				{
3935					graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3936					source_exec_created = 1;
3937				}
3938				ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3939			}
3940		}
3941	}
3942	// Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3943	// (we need that if it is not associated as inputs / outputs of any execs, this is possible if all execs associate
3944	// with its alias).
3945	assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3945, __extension__ __PRETTY_FUNCTION__
); }));
3946	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3947	{
3948		ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3949		// If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3950		if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3951		{
3952			const ccv_array_t* const head = tensor_blocks[i].head;
3953			if (head && head->rnum > 0)
3954				for (j = 0; j < head->rnum; j++)
3955				{
3956					const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
 * (size_t)(j)));
3957					if (idx >= exec_symbol_info_size)
3958						continue;
3959					assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3959, __extension__ __PRETTY_FUNCTION__); }));
3960					const int d = graph_execs[idx].d;
3961					ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)));
					// flag becomes 1 once one of the exec's tensor wraps already has mv as its first tensor.
3962					int flag = 0;
3963					if (exec_info->tensor_wraps_ref)
3964					{
3965						ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)));
3966						for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3967							flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3968					}
3969					// If none is in the flag, it need to be included in the cast.
3970					if (!flag)
3971						ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3972				}
3973		}
3974	}
3975	// Create source / destination phony node. This is to facilitate use of compiled graph.
3976	// Also, this is needed if you have init zero execs.
3977	if (source_exec_created || source_size > 1)
3978	{
3979		if (!source_exec_created)
3980			graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3981		for (i = 0; i < source_size; i++)
3982			ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3983	} else {
		// Exactly one source and no init execs: that source's exec is used directly, without a phony node.
3984		assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
 ({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3984, __extension__ __PRETTY_FUNCTION__
); }));
3985		assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
 if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3985, __extension__ __PRETTY_FUNCTION__
); }));
3986		graph_exec_arena->source = graph_execs[sources[0].d];
3987	}
3988	if (destination_size == 1)
3989		graph_exec_arena->destination = graph_execs[destinations[0].d];
3990	else {
3991		graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3992		for (i = 0; i < destination_size; i++)
3993			ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3994	}
3995	ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3996	ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3997	return graph_exec_arena;
3998}
3999 
// Search a prep tree, depth-first, for the prep that was built from a given symbolic graph.
// @param graph_prep Root of the (sub-)tree to search.
// @param pair The symbolic graph to look for.
// @return The concrete graph of the matching prep, or 0 if no prep in this tree matches.
4000static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
4001{
4002	if (graph_prep->symbolic_graph == pair)
4003		return graph_prep->graph;
4004	int i;
	// Not this prep: try each populated sub-prep and return the first hit.
4005	for (i = 0; i < graph_prep->sub_prep_size; i++)
4006		if (graph_prep->sub_preps[i])
4007		{
4008			ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
4009			if (graph)
4010				return graph;
4011		}
4012	return 0;
4013}
4014 
// Set the pair pointer on the concrete graphs of graph_prep's direct sub-preps.
// For every populated sub-prep whose symbolic graph has a pair, the concrete graph's pair is
// set to whatever _ccv_nnc_graph_find_pair locates under root_prep (0 when nothing matches).
// Only direct sub-preps are handled here; this function does not recurse.
// @param root_prep Root of the prep tree searched for the paired graph.
// @param graph_prep The prep whose direct sub-preps are fixed up.
4015static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4016{
4017	int i;
4018	for (i = 0; i < graph_prep->sub_prep_size; i++)
4019		if (graph_prep->sub_preps[i])
4020		{
4021			if (graph_prep->sub_preps[i]->symbolic_graph->pair)
4022				graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
4023		}
4024}
4025 
// Pair concrete execs according to the pair_ref recorded on their exec symbols, recursively.
// For each exec symbol that is not dead, has a concrete exec in this arena and a non-zero
// pair_ref, the paired exec is looked up through root_arena and, if found (d >= 0), the two
// are linked with ccv_nnc_graph_exec_pair_with. The same is then done for every populated sub-prep.
// @param root_arena The arena used to resolve the paired exec symbol.
// @param graph_prep The prep matching graph_exec_arena (asserted via graph_ref).
// @param graph_exec_arena The arena whose execs are paired.
4026static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4027{
4028	assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4028, __extension__ __PRETTY_FUNCTION__
); }));
4029	int i;
4030	for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
4031	{
4032		if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
))
4033			continue;
4034		if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4035		{
			// pair_ref is 1-based (0 means none). The symbol is resolved in the paired symbolic graph when one exists, otherwise in this graph itself.
4036			ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4037				.d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4038				.graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4039			});
4040			if (pair_exec.d >= 0)
4041				ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4042		}
4043	}
4044	for (i = 0; i < graph_prep->sub_prep_size; i++)
4045		if (graph_prep->sub_preps[i])
4046			_ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4047}
4048 
4049static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4050{
4051	int i;
4052	if (graph_prep->dup_breakpoints)
4053	{
4054		// Strip the const modifier only possible because it is a sub-graph.
4055		ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4056		for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4057			ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
 + (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i))));
4058		ccv_array_free(graph_prep->dup_breakpoints);
4059		graph_prep->dup_breakpoints = 0;
4060		graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4061		// Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
4062		memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))), sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4063		// Since exec_symbol_info changed, create a new visit object.
4064		assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
 ({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4064, __extension__ __PRETTY_FUNCTION__
); }));
4065		assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
 ({ if (symbolic_graph->destinations) ; else __assert_fail
 ("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4065, __extension__ __PRETTY_FUNCTION__); }));
4066		ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)));
4067		const int source_size = symbolic_graph->sources->rnum;
4068		ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)));
4069		const int destination_size = symbolic_graph->destinations->rnum;
4070		ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
 ((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
 c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
 = 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
 = ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
 > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
 _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
 = (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4070, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
 } _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
 { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
 3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
 __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4070, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
 _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
 (_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
 (_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
 (((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
 ({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
 ("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4070, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
 1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
 _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
 _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
 ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
 d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
 * (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
 ((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
 (symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
 ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
 1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
 : 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _visit_; });
4071		ccv_nnc_graph_visit_free(graph_prep->visit);
4072		graph_prep->visit = visit;
4073		assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
 if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4073, __extension__ __PRETTY_FUNCTION__
); }));
4074		ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4075	}
4076	ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
 ((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx; {
4077		for (i = 0; i < node->graph_ref_size; i++)
4078		{
4079			const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)[i] - 1;
4080			if (graph_ref >= 0)
4081				_ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4082		}
4083	} ccv_nnc_graph_visit_endfor} }
4084}
4085 
// Default compile parameters: defined with an empty initializer, so every field starts out zeroed.
4086const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4087 
4088void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4089{
4090	assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4090, __extension__ __PRETTY_FUNCTION__); }));
4091	assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
 if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4091, __extension__ __PRETTY_FUNCTION__
); }));
4092	assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
 ({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4092, __extension__ __PRETTY_FUNCTION__
); }));
4093	int i;
4094	// Cannot bind the multi-view.
4095	for (i = 0; i < tensor_bind_size; i++)
4096	{
4097		assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
 ({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4097, __extension__ __PRETTY_FUNCTION__
); }));
4098		assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4098, __extension__ __PRETTY_FUNCTION__
); }));
4099	}
4100	ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4101	_ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4102	ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4103	_ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4104	*tensor_arena_ref = tensor_arena;
4105	// The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
4106	_ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4107	// Now tensor allocation is done, if there are any dup_breakpoints, I need to clean it up.
4108	_ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4109	*graph_ref = graph_prep->graph;
4110	ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4111	_ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4112	_ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4113	*graph_exec_arena_ref = graph_exec_arena;
4114	_ccv_nnc_symbolic_graph_prep_free(graph_prep);
4115}
4116 
4117static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4118{
4119	// Buffers are inherited from above, no need to dealloc.
4120	int i;
4121	for (i = 0; i < tensor_arena->sub_arena_size; i++)
4122		if (tensor_arena->sub_arenas[i])
4123			_ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4124	for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4125	{
4126		ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
 (size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i))));
4127		assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4127, __extension__ __PRETTY_FUNCTION__
); }));
4128		ccv_nnc_tensor_multiview_free(*mv);
4129	}
4130	ccv_array_free(tensor_arena->tensor_metadata);
4131	ccv_array_free(tensor_arena->m_tensor_idx);
4132	if (tensor_arena->pb_vt_tensors)
4133		ccfreefree(tensor_arena->pb_vt_tensors);
4134	if (tensor_arena->vt_alias_r_refs_p)
4135		ccfreefree(tensor_arena->vt_alias_r_refs_p);
4136	if (tensor_arena->vt_sizes)
4137		ccfreefree(tensor_arena->vt_sizes);
4138	ccfreefree(tensor_arena);
4139}
4140 
4141void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4142{
4143	assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
 == (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4143, __extension__ __PRETTY_FUNCTION__
); }));
4144	assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4144, __extension__ __PRETTY_FUNCTION__
); }));
4145	assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
 if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4145, __extension__ __PRETTY_FUNCTION__
); }));
4146	// Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4147	int i;
4148	if (!tensor_arena->pb_vt_tensors)
4149	{
4150		tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4151		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4152			if (tensor_arena->vt_tensors[i])
4153				tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4154	}
4155	if (!tensor_arena->vt_alias_r_refs_p)
4156	{
4157		tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4158		tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4159		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4160			if (tensor_arena->vt_alias_refs[i])
4161			{
4162				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4163				assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
 >= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4163, __extension__ __PRETTY_FUNCTION__
); }));
4164				++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4165			}
4166		int refp = 0;
4167		for (i = 0; i < tensor_arena->vt_tensor_size; i++) // Allocate each with aliases position on vt_alias_r_refs. It points to the end.
4168			if (tensor_arena->vt_alias_r_refs_p[i])
4169				refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4170			else
4171				tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4172		for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4173			tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4174		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4175			if (tensor_arena->vt_alias_refs[i])
4176			{
4177				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4178				assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
 >= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4178, __extension__ __PRETTY_FUNCTION__
); }));
4179				const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4180				assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4180, __extension__ __PRETTY_FUNCTION__); }));
4181				tensor_arena->vt_alias_r_refs[pos] = i;
4182			}
4183	}
4184	const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
4185	if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4186	{
4187		assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4187, __extension__ __PRETTY_FUNCTION__
); })); // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4188		assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4190, __extension__ __PRETTY_FUNCTION__
); }))
4189					ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4190, __extension__ __PRETTY_FUNCTION__
); }))
4190				(size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4190, __extension__ __PRETTY_FUNCTION__
); }));
4191	} else
4192		{ assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof ((ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
 ({ if (ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
 ("ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4192, __extension__ __PRETTY_FUNCTION__
); })); }
4193	if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d])((*(int*)(tensor_arena->vt_tensors[symbol.d])) & CCV_TENSOR_VIEW
))
4194		{ assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors
[symbol.d])->off == 0) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t
*)tensor_arena->vt_tensors[symbol.d])->off == 0) ; else
 __assert_fail ("((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4194, __extension__ __PRETTY_FUNCTION__
); })); }
4195	tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4196	if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4197		for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4198		{
4199			const int d = tensor_arena->vt_alias_r_refs[i];
4200			if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4201				break;
4202			ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4203			d_tensor->info.datatype = tensor->info.datatype;
4204			d_tensor->info.reserved = tensor->info.reserved;
4205			if (CCV_IS_TENSOR_VIEW(d_tensor)((*(int*)(d_tensor)) & CCV_TENSOR_VIEW))
4206				ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4207			else {
4208				d_tensor->data.u8 = tensor->data.u8;
4209				d_tensor->dataof = tensor->dataof;
4210			}
4211		}
4212}
4213 
4214void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4215{
4216	if (!tensor_arena->pb_vt_tensors)
4217		return;
4218	int i;
4219	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4220		if (tensor_arena->vt_tensors[i])
4221			tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4222}
4223 
4224uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4225{
4226	uint64_t total_size = 0;
4227	int i;
4228	for (i = 0; i < tensor_arena->buffer_size; i++)
4229		total_size += tensor_arena->buffers[i].size;
4230	return total_size;
4231}
4232 
4233static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4234{
4235	int i;
4236	if (mv->it)
4237		mv->it->info = params;
4238	for (i = 0; i < mv->repeat + mv->kind; i++)
4239	{
4240		ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[i];
4241		if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4242			_ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4243		else
4244			tensor->info = params;
4245	}
4246}
4247 
4248int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4249{
4250	int i;
4251	assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size)((void) sizeof ((graph->tensor_symbol_info->rnum >= tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (graph->
tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size
) ; else __assert_fail ("graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4251, __extension__ __PRETTY_FUNCTION__
); }));
4252	if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
4253	{
4254		tensor_arena->vt_sizes = (size_t*)ccmallocmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4255		for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4256			if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4257			{
4258				ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4259				if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4260				{
4261					ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4262					while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
4263						mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)[0]);
4264					tensor = (ccv_nnc_tensor_t*)mv;
4265				}
4266				tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4267			}
4268	}
4269	int flag = 0;
4270	for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4271		if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4272		{
4273			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
 (size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)));
4274			ccv_nnc_tensor_param_t params = symbol_info->info;
4275			params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4276			params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4277			flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4278		}
4279	if (flag)
4280		return -1;
4281	for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4282		if (tensor_arena->vt_tensors[i])
4283		{
4284			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
 (size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)));
4285			ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4286			if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4287			{
4288				assert(!tensor_arena->vt_alias_refs[i])((void) sizeof ((!tensor_arena->vt_alias_refs[i]) ? 1 : 0)
, __extension__ ({ if (!tensor_arena->vt_alias_refs[i]) ; else
 __assert_fail ("!tensor_arena->vt_alias_refs[i]", "ccv_nnc_symbolic_graph_compile.c"
, 4288, __extension__ __PRETTY_FUNCTION__); }));
4289				_ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4290			} else if (!tensor_arena->vt_alias_refs[i]) {
4291				ccv_nnc_tensor_param_t params = symbol_info->info;
4292				params.datatype = tensor->info.datatype;
4293				params.reserved = tensor->info.reserved;
4294				tensor->info = params;
4295			} else {
4296				off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4297				ccv_nnc_tensor_param_t params = symbol_info->info;
4298				params.datatype = tensor->info.datatype;
4299				params.reserved = tensor->info.reserved;
4300				tensor->info = params;
4301				const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4302				ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4303				if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4304				{
4305					((ccv_nnc_tensor_view_t*)tensor)->off = off;
4306					memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4307				}
4308			}
4309		}
4310	// Should handle sub_tensor_arena, don't do that at the moment.
4311	assert(!graph->sub_graphs)((void) sizeof ((!graph->sub_graphs) ? 1 : 0), __extension__
 ({ if (!graph->sub_graphs) ; else __assert_fail ("!graph->sub_graphs"
, "ccv_nnc_symbolic_graph_compile.c", 4311, __extension__ __PRETTY_FUNCTION__
); }));
4312	return 0;
4313}
4314 
4315void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4316{
4317	assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size)((void) sizeof ((symbolic_graph->exec_symbol_info->rnum
 >= graph_exec_arena->graph_exec_size) ? 1 : 0), __extension__
 ({ if (symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena
->graph_exec_size) ; else __assert_fail ("symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 4317, __extension__ __PRETTY_FUNCTION__
); }));
4318	int i;
4319	for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4320	{
4321		const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4322		if (graph_exec.d < 0)
4323			continue;
4324		const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4325		const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
 (size_t)(i)));
4326		ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4327		if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, replacing the backend and algorithm to the existing one, which hypothetically has been autotuned..
4328		{
4329			new_cmd.backend = existing_cmd.backend;
4330			new_cmd.algorithm = existing_cmd.algorithm;
4331		}
4332		ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4333	}
4334}
4335 
4336void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4337{
4338	int i;
4339	for (i = 0; i < tensor_arena->buffer_size; i++)
4340	{
4341		if (!tensor_arena->buffers[i].ptr)
4342			continue;
4343		const int buffer_type = tensor_arena->buffers[i].type;;
4344		const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
4345#ifdef HAVE_CUDA1
4346		const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4347		if (memory_type == CCV_TENSOR_GPU_MEMORY)
4348		{
4349			if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4350				tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4351			else
4352				cufree(device_id, tensor_arena->buffers[i].ptr);
4353		} else {
4354			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4354, __extension__ __PRETTY_FUNCTION__
); }));
4355			if (tensor_arena->buffers[i].pin_mem)
4356				cuhostfree(tensor_arena->buffers[i].ptr);
4357			else
4358				ccfreefree(tensor_arena->buffers[i].ptr);
4359		}
4360#elif defined(HAVE_MPS)
4361		const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4362		if (memory_type == CCV_TENSOR_GPU_MEMORY)
4363		{
4364			// if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4365			// 	tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4366			// else
4367			mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4368		} else {
4369			assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4369, __extension__ __PRETTY_FUNCTION__
); }));
4370			ccfreefree(tensor_arena->buffers[i].ptr);
4371		}
4372#else
4373		assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
 ; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4373, __extension__ __PRETTY_FUNCTION__
); }));
4374		ccfreefree(tensor_arena->buffers[i].ptr);
4375#endif
4376		tensor_arena->buffers[i].ptr = 0;
4377	}
4378	// For now, the life-cycle of the disposers lives with the buffer. It may ends before the tensor arena deallocates.
4379	if (tensor_arena->disposers)
4380	{
4381		for (i = 0; i < tensor_arena->disposers->rnum; i++)
4382		{
4383			ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i)((void*)(((char*)((tensor_arena->disposers)->data)) + (
size_t)(tensor_arena->disposers)->rsize * (size_t)(i)));
4384			disposer->dispose(disposer->ptr, disposer->userdata);
4385		}
4386		ccv_array_free(tensor_arena->disposers);
4387		tensor_arena->disposers = 0;
4388	}
4389}
4390 
4391void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4392{
4393	ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4394	_ccv_nnc_tensor_arena_free(tensor_arena);
4395}
4396 
4397void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4398{
4399	int i;
4400	for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4401		if (graph_exec_arena->sub_arenas[i])
4402			ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4403	ccfreefree(graph_exec_arena);
4404}