Bug Summary

File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 3759, column 7: The left operand of '==' is a garbage value
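The flagged expression at line 3759 falls outside the excerpt reproduced below, so the warning itself is not annotated here. As a rough, hypothetical illustration of this class of diagnostic (not the actual ccv code path), the analyzer emits the same message whenever an uninitialized value can reach the left side of a comparison:

/* Minimal, hypothetical example of the same diagnostic; the names below are
 * illustrative and do not appear in ccv_nnc_symbolic_graph_compile.c. */
static int contains(const int* values, int count, int needle)
{
	int found; /* left uninitialized when the loop body never assigns it */
	int i;
	for (i = 0; i < count; i++)
		if (values[i] == needle)
			found = 1;
	if (found == 1) /* analyzer: The left operand of '==' is a garbage value */
		return 1;
	return 0;
}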

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/liu/actions-runner/_work/ccv/ccv/_analyze/2024-11-11-172035-379299-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12
13// MARK - Level-3 API
14
15typedef struct {
16 int flags;
17 int type;
18 int pin_mem; // This memory needs to be pinned.
19 int ref; // Reference to another tensor block. Start with 1.
20 int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21 int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22 int companion_ref; // Reference to another block; the two share the same memory region. Start with 1. The current crude implementation requires the two to mutually be companions. Because there are two, we take the one with companion_ref <= i as the primary and the one with companion_ref > i as the secondary. For the allocation algorithm, we use the primary throughout.
23 int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24 ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25 uint64_t size; // The size of the tensor expected.
26 int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28 ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29 ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
32#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
34enum {
35 UNASSIGNED = 0x1,
36 ALIAS = 0x2,
37 READ_ONLY = 0x4,
38 WRITE_ONLY = 0x8,
39 READ_WRITE = 0xc,
40 ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not reference to any specific tensor).
41 UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
42 UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
43};
44
45#define TENSOR_EXPECT_ORDINARY(t) ((t.flags & 0x3) == 0)
46#define TENSOR_EXPECT_SET_ORDINARY(t) (t.flags = (t.flags & ~0x3))
47#define TENSOR_EXPECT_UNASSIGNED(t) ((t.flags & 0x3) == UNASSIGNED)
48#define TENSOR_EXPECT_SET_UNASSIGNED(t) (t.flags = ((t.flags & ~0x3) | UNASSIGNED))
49#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) (t.flags = (t.flags & ~0x1))
50#define TENSOR_EXPECT_ALIAS(t) ((t.flags & 0x3) == ALIAS)
51#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
52#define TENSOR_READ_WRITE(t) (t.flags & 0xc)
53#define TENSOR_SET_READ_WRITE(t, rw) (t.flags = ((t.flags & ~0xc) | rw))
54#define TENSOR_SET_ANONYMOUS(t) (t.flags = ((t.flags & ~0x10) | ANONYMOUS))
55#define TENSOR_IS_ANONYMOUS(t) (t.flags & ANONYMOUS)
56#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_INPUT))
57#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) (t.flags & UNFOLDABLE_AS_INPUT)
58#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) (t.flags = (t.flags | UNFOLDABLE_AS_OUTPUT))
59#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) (t.flags & UNFOLDABLE_AS_OUTPUT)
60
61#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62
63// Holds additional information about the exe nodes.
64typedef struct {
65 int flags;
66} ccv_nnc_graph_exec_flag_t;
67
68enum {
69 CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert additional IO transfer for case..of statement.
70};
71
72typedef struct {
73 int index;
74 int oc;
75 int type;
76 uint64_t size;
77} ccv_nnc_tensor_opt_t;
78
79// We first sort the same type together (because they won't be reused at all).
80// And then we sort by size, after that, sort by oc.
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)
83#undef more_than
84typedef struct {
85 int idx;
86 int hop;
87} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)
90#undef less_than
91
92// If b has items that overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95 assert(a);
96 assert(b);
97 int x, y;
98 for (x = 0; x < b->rnum; x++)
99 {
100 const int p = *(int*)ccv_array_get(b, x);
101 int flag = 0;
102 // In extreme cases where a is a superset of b, then a is still after b, we are good.
103 for (y = 0; !flag && y < a->rnum; y++)
104 {
105 const int q = *(int*)ccv_array_get(a, y);
106 flag = (p == q);
107 }
108 if (!flag)
109 for (y = 0; y < a->rnum; y++)
110 {
111 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y), p);
112 if (!cell.i32 || cell.i32[0] == 0)
113 return 0;
114 }
115 }
116 // If b->rnum == 0, a is after b for sure.
117 // Otherwise, if a->rnum == 0, we don't check any, but if b->rnum > 0, then we cannot say a is after b.
118 // If both a->rnum > 0 and b->rnum > 0, the logic above should have checked all of them.
119 return (a->rnum > 0 || b->rnum == 0);
120}
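For orientation, the check above can be restated without the ccv sparse-matrix API. Below is a minimal sketch, assuming a hypothetical dep(graph, from, to) predicate that returns nonzero when node 'from' deterministically executes after node 'to' (standing in for the ccv_get_sparse_matrix_cell lookup on exec_dep); it is not part of the source being analyzed:

/* Hedged sketch of the "a after b (inclusively)" rule, using a hypothetical
 * dep() predicate instead of the exec_dep sparse matrix. */
typedef int (*dep_f)(const void* graph, int from, int to);

static int a_after_b_inclusively(const void* graph, dep_f dep, const int* a, int a_num, const int* b, int b_num)
{
	int x, y;
	for (x = 0; x < b_num; x++)
	{
		const int p = b[x];
		int overlap = 0;
		for (y = 0; !overlap && y < a_num; y++)
			overlap = (a[y] == p); /* a containing p counts as "after" */
		if (!overlap)
			for (y = 0; y < a_num; y++)
				if (!dep(graph, a[y], p)) /* some head of a is not after p */
					return 0;
	}
	/* An empty b is trivially satisfied; an empty a with a non-empty b is not. */
	return (a_num > 0 || b_num == 0);
}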
121
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124 assert(a);
125 assert(b);
126 int x, y, max_hop = 0;
127 for (x = 0; x < a->rnum; x++)
128 for (y = 0; y < b->rnum; y++)
129 {
130 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, x), *(int*)ccv_array_get(b, y));
131 if (!cell.i32 || cell.i32[0] == 0)
132 return 0;
133 max_hop = ccv_max(cell.i32[0], max_hop);
134 }
135 // We've entered this nested-for loop, therefore, it must be verifiably, deterministically after b now.
136 // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
137 return max_hop;
138}
139
140// If every a's head is deterministically after b's tail
141static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
142{
143 return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
144}
145
146typedef struct {
147 ccv_array_t** alloc_dep;
148 int vt_block_size;
149 int buffer_size;
150 int block_size;
151 int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
152 struct {
153 int type; // The type from tensor blocks.
154 int pin_mem; // Whether this is pinned memory.
155 int flags; // The flags (currently for READ_ONLY or not).
156 uint64_t size; // The size of the buffer allocated.
157 int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
158 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
159 }* buffers;
160 struct {
161 int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
162 int block_ref; // A reference to which block in the given tensor_block to use.
163 uint64_t offset; // The offset of this block.
164 }* blocks;
165} ccv_nnc_tensor_alloc_prep_t;
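To make the buffers/blocks split concrete: each surviving tensor block ends up with a (buffer_ref, offset) pair, so once the buffers themselves are allocated, a block's address is the buffer base plus that offset. A minimal sketch of the lookup, assuming a hypothetical bases[] array of allocated buffer pointers (not a field of the struct above):

/* Hypothetical helper, for illustration only: resolve a vt index to an
 * address using the prep structure above and externally allocated buffers. */
static void* tensor_block_address(const ccv_nnc_tensor_alloc_prep_t* prep, uint8_t* const* bases, int vt_index)
{
	const int block = prep->vt_blocks[vt_index]; /* -1 means not backed by a block */
	if (block < 0)
		return 0;
	return bases[prep->blocks[block].buffer_ref] + prep->blocks[block].offset;
}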
166
167typedef struct ccv_nnc_symbolic_graph_prep_s {
168 int flags;
169 int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
170 int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
171 int exec_idx;
172 int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
173 int tensor_symbol_info_size;
174 int exec_symbol_info_size;
175 int tensor_block_size;
176 int sub_prep_size;
177 ccv_nnc_tensor_block_t* tensor_blocks;
178 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
179 ccv_nnc_graph_exec_flag_t* exec_flags;
180 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
181 int* dup_tensor_block_ref;
182 ccv_nnc_graph_visit_t* visit;
183 ccv_nnc_tensor_alloc_prep_t* alloc_prep;
184 struct ccv_nnc_symbolic_graph_prep_s* p;
185 struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
186 // Structures that don't require to be freed after deallocation.
187 const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
188 ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
189 ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
190 ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
191} ccv_nnc_symbolic_graph_prep_t;
192
193typedef struct {
194 int oc;
195 ccv_array_t* itf;
196} ccv_nnc_tensor_block_adjacent_t;
197
198static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
199{
200 // Compute how many dis-continuous buffers are needed.
201 // We prefer to have several dis-continuous buffers instead of one big buffer because
202 // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
203 // to fully utilize memory.
204 int i, j, k;
205 ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloc(tensor_block_size, sizeof(ccv_array_t*));
206 int allocable_tensor_size = 0, available_tensor_size = 0;
207 for (i = 0; i < tensor_block_size; i++)
208 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
209 {
210 // Tensors that we need the header info.
211 ++available_tensor_size;
212 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
213 // Tensors that we actually need to allocate (exclude the alias).
214 ++allocable_tensor_size;
215 }
216 ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
217 ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
218 ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
219 // Overlap count.
220 for (i = 0; i < tensor_block_size; i++)
221 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]))
222 for (j = i + 1; j < tensor_block_size; j++)
223 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j]))
224 {
225 // Check to see if they interfere (default to yes).
226 // If any of the i's head is deterministically later than j's tail
227 // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
228 const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
229 if (i_hop_j > 0)
230 {
231 ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
232 ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
233 }
234 const int j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
235 if (j_hop_i > 0)
236 {
237 ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
238 ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
239 }
240 // It cannot be that both i can hop to j and j can hop to i.
241 assert(!(i_hop_j > 0 && j_hop_i > 0));
242 if (!i_hop_j && !j_hop_i && tensor_blocks[i].type == tensor_blocks[j].type)
243 {
244 if (!adj[i].itf)
245 adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
246 ccv_array_push(adj[i].itf, &j);
247 ++adj[i].oc;
248 if (!adj[j].itf)
249 adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
250 ccv_array_push(adj[j].itf, &i);
251 ++adj[j].oc;
252 }
253 }
254 const int exec_dep_rows = exec_dep->rows;
255 ccv_matrix_free(exec_dep);
256 ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
257 int* const assigned = (int*)cccalloc(tensor_block_size, sizeof(int));
258 uint64_t* const allocated_offset = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
259 uint64_t* const allocated_size = (uint64_t*)cccalloc(tensor_block_size, sizeof(uint64_t));
260 uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
261 int num_assigned = 0;
262 // I could do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
263 // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1)).
264 // The first channel denotes the bytes available for allocation,
265 // the second channel denotes the offset available for the allocation.
266 ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
267 ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
268 for (j = 0; j < allocable_tensor_size;)
269 {
270 // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
271 uint64_t max_size = 0;
272 ccv_array_clear(opt);
273 int current_type = 0; // Deal with one type at a time.
274 for (i = 0; i < tensor_block_size; i++)
275 if (tensor_blocks[i].size >= max_size &&
276 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i]) && !assigned[i] &&
277 IS_PRIMARY_COMPANION(i, tensor_blocks[i]) &&
278 (!current_type || tensor_blocks[i].type == current_type))
279 {
280 ccv_nnc_tensor_opt_t a = {
281 .size = tensor_blocks[i].size,
282 .index = i,
283 .oc = adj[i].oc,
284 .type = tensor_blocks[i].type,
285 };
286 assert(a.type);
287 current_type = a.type; // Now we know the primary type we should deal with.
288 if (tensor_blocks[i].companion_ref)
289 {
290 const int companion_ref = tensor_blocks[i].companion_ref - 1;
291 a.size = ccv_max(a.size, tensor_blocks[companion_ref].size);
292 a.oc += adj[companion_ref].oc;
293 }
294 // In case we have a tie, take them all in the array.
295 if (a.size > max_size)
296 ccv_array_clear(opt), max_size = a.size;
297 ccv_array_push(opt, &a);
298 }
299 assert(opt->rnum > 0);
300 // Order opt array by the oc because type and size should be equal at this point.
301 _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
302 // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
303 int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
304 uint64_t min_val[2] = {
305 0, 0
306 };
307 if (j > 0)
308 {
309 for (i = 0; i < opt->rnum; i++)
310 {
311 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i);
312 if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
313 continue;
314 // Now, determine the order between a and c. After this, we can always check whether y
315 // can hop to the earliest one and if the latest one can hop to x.
316 // The earliest one will be called p and the latest one will be called q.
317 int p = a.index;
318 int q = a.index;
319 if (tensor_blocks[a.index].companion_ref)
320 {
321 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
322 if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
323 continue;
324 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
325 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
326 p = companion_ref;
327 else {
328 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
329 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
330 q = companion_ref;
331 else { // Otherwise, b is in between p and q.
332 const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
333 const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
334 assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0);
335 }
336 }
337 }
338 assert(tensor_blocks[q].type == tensor_blocks[p].type);
339 const int type = tensor_blocks[p].type;
340 // y is always earlier than x, but this is hard to assert now.
341 // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
342 // Thus, the hop between y and x (through a) should be smallest ones.
343 // We optimized this by first finding all allocated nodes that come into p, and all allocated nodes that go
344 // out of q. For these nodes, we try to verify whether they form a connection (by checking against the
345 // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
346 int y_size = 0;
347 ccv_nnc_tensor_hop_t* const y_buf = buf;
348#define for_block(y, val) do { \
349 if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
350 y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
351 .idx = y + 1, .hop = ((int*)val)[0] \
352 }; \
353 } while(0)
354 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
355 if (y_vector)
356 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
357#undef for_block
358 assert(y_size <= tensor_block_size);
359 int x_size = 0;
360 ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
361#define for_block(x, val) do { \
362 if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
363 x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
364 .idx = x + 1, .hop = ((int*)val)[0] \
365 }; \
366 } while(0)
367 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
368 if (x_vector)
369 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
370#undef for_block
371 assert(y_size + x_size <= tensor_block_size);
372 int x, y;
373 _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
374 for (y = 0; y < y_size; y++)
375 {
376 const int hop = exec_dep_rows + y_buf[y].hop;
377 if (hop >= min_hop)
378 break;
379 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
380 if (val.u64 && val.u64[0] >= a.size)
381 {
382 min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
383 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
384 break;
385 }
386 }
387 _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
388 for (x = 0; x < x_size; x++)
389 {
390 const int hop = exec_dep_rows + x_buf[x].hop;
391 if (hop >= min_hop)
392 break;
393 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
394 if (val.u64 && val.u64[0] >= a.size)
395 {
396 min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
397 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
398 break;
399 }
400 }
401 const int x_min_hop = x_buf[0].hop;
402 for (y = 0; y < y_size; y++)
403 {
404 const int y_hop_p_v = y_buf[y].hop;
405 if (y_hop_p_v + x_min_hop >= min_hop)
406 break;
407 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
408 if (y_vector)
409 {
410 for (x = 0; x < x_size; x++)
411 {
412 const int q_hop_x_v = x_buf[x].hop;
413 const int hop = y_hop_p_v + q_hop_x_v;
414 if (hop >= min_hop)
415 break;
416 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
417 if (val.u64 && val.u64[0] >= a.size)
418 {
419 min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
420 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
421 break;
422 }
423 }
424 }
425 }
426 // If I found a place, stop, and exit.
427 if (min_y > 0 || min_x < tensor_block_size + 1)
428 {
429 min_i = i;
430 break;
431 }
432 // There is no space to insert this block, mark it as such.
433 tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
434 if (tensor_blocks[a.index].companion_ref)
435 {
436 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
437 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
438 }
439 }
440 }
441 // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group).
442 // and default to largest size available.
443 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i));
444 if (min_i == -1)
445 {
446 allocated_size[num_assigned] = a.size;
447 ++num_assigned;
448 }
449 int assign_group = num_assigned;
450 if (min_y > 0)
451 {
452 assign_group = assigned[min_y - 1];
453 // The y and x should belong to the same assigned group.
454 assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group);
455 } else if (min_x < tensor_block_size + 1)
456 assign_group = assigned[min_x - 1];
457 // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
458 if (min_y != 0 || min_x != tensor_block_size + 1)
459 {
460 uint64_t val[2] = {
461 min_val[0], min_val[1]
462 };
463 assert(val[0] >= a.size);
464 val[0] -= a.size;
465 val[1] = val[1] + a.size; // Move the offset to the next one.
466 ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
467 }
468 int strings[3];
469 strings[0] = a.index + 1;
470 int string_size = 1;
471 // Assign out the designated companion if it exists.
472 if (tensor_blocks[a.index].companion_ref)
473 {
474 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
475 assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type);
476 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
477 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
478 {
479 for (i = 0; i < string_size; i++)
480 strings[i + 1] = strings[i];
481 strings[0] = companion_ref + 1;
482 } else {
483 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
484 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
485 strings[string_size] = companion_ref + 1;
486 else {
487 // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must in between p and q. Therefore, I must have 2 allocations.
488 assert(string_size == 2);
489 strings[2] = strings[1];
490 strings[1] = companion_ref + 1;
491 }
492 }
493 ++string_size;
494 }
495 // Assign out and update oc.
496 for (i = 0; i < string_size; i++)
497 {
498 const int index = strings[i] - 1;
499 // Assign out the selected one.
500 assigned[index] = assign_group;
501 // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
502 allocated_offset[index] = min_val[1];
503 if (adj[index].itf)
504 for (k = 0; k < adj[index].itf->rnum; k++)
505 {
506 const int d = *(int*)ccv_array_get(adj[index].itf, k);
507 if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))
508 --adj[d].oc;
509 }
510 }
511 uint64_t val[2] = {
512 a.size, min_val[1]
513 };
514 uint64_t consumed_size = 0;
515 // Go over from min_y to string_size (excluding min_x).
516 for (i = 0; i < string_size; i++)
517 {
518 const uint64_t size = tensor_blocks[strings[i] - 1].size;
519 assert(size <= a.size);
520 // Update consumed size if it is bigger than "size".
521 if (size > consumed_size)
522 {
523 val[0] = size - consumed_size;
524 ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
525 consumed_size = size;
526 val[1] = min_val[1] + consumed_size;
527 }
528 // If it consumed all the flow, break out.
529 if (consumed_size == a.size)
530 break;
531 }
532 for (i = 0; i < string_size; i++)
533 {
534 const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
535 uint64_t val[2] = {
536 i_size, min_val[1]
537 };
538 uint64_t consumed_size = 0;
539 for (k = i + 1; k < string_size; k++)
540 {
541 const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size);
542 // Update consumed size if it is bigger than "size".
543 if (size > consumed_size)
544 {
545 val[0] = size - consumed_size;
546 ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
547 consumed_size = size;
548 val[1] = min_val[1] + consumed_size;
549 }
550 // If it consumed all the flow, break out.
551 if (consumed_size == i_size)
552 break;
553 }
554 val[0] = i_size - consumed_size;
555 // Still have residual, flow it to min_x.
556 if (val[0] > 0)
557 ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
558 }
559 if (min_i == -1)
560 {
561 // If we decide to insert a new edge, simply mark anyone that does not interfere with it to be redone.
562 const int p = strings[0] - 1;
563 const int q = strings[string_size - 1] - 1;
564 const int type = tensor_blocks[p].type;
565#define for_block(y, val) do { \
566 if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
567 { \
568 tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
569 if (tensor_blocks[y].companion_ref) \
570 { \
571 const int companion_ref = tensor_blocks[y].companion_ref - 1; \
572 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
573 } \
574 } \
575 } while(0)
576 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
577 if (y_vector)
578 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block);
579#undef for_block
580#define for_block(x, val) do { \
581 if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
582 { \
583 tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
584 if (tensor_blocks[x].companion_ref) \
585 { \
586 const int companion_ref = tensor_blocks[x].companion_ref - 1; \
587 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f)); \
588 } \
589 } \
590 } while(0)
591 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
592 if (x_vector)
593 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block);
594#undef for_block
595 }
596 j += string_size;
597 }
598 ccfree(tensor_block_cannot_insert);
599 ccfree(buf);
600 ccv_array_free(opt);
601 ccv_matrix_free(tensor_df);
602 ccv_matrix_free(tensor_dt);
603#define for_block(y, x, val) do { \
604 if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
605 { \
606 if (!alloc_dep[x - 1]) \
607 alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
608 ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
609 } \
610 } while (0)
611 CCV_SPARSE_FOREACH(alloc, for_block);
612#undef for_block
613 ccv_matrix_free(alloc);
614 for (i = 0; i < tensor_block_size; i++)
615 if (adj[i].itf)
616 ccv_array_free(adj[i].itf);
617 ccfree(adj);
618 ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
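// A sketch of how the single ccmalloc above is carved up by the pointer arithmetic below
// (inferred from that arithmetic; laid out from the biggest structs to the smaller ones):
//   [ccv_nnc_tensor_alloc_prep_t header]
//   [blocks:    alloc_prep->blocks[0]  x available_tensor_size]
//   [buffers:   alloc_prep->buffers[0] x num_assigned]
//   [vt_blocks: int                    x tensor_block_size]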
619 alloc_prep->alloc_dep = alloc_dep;
620 alloc_prep->vt_block_size = tensor_block_size;
621 alloc_prep->buffer_size = num_assigned;
622 alloc_prep->block_size = available_tensor_size;
623 alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
624 alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
625 alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
626 memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
627 for (i = 0; i < num_assigned; i++)
628 alloc_prep->buffers[i].size = allocated_size[i];
629 if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
630 {
631 size_t total_size = 0;
632 for (i = 0; i < num_assigned; i++)
633 total_size += allocated_size[i];
634 PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size);
635 }
636 ccfree(allocated_size);
637 j = 0;
638 // Assigning out the tensors (in case of sharing tensors / in-place ops).
639 for (i = 0; i < tensor_block_size; i++)
640 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i]))
641 {
642 alloc_prep->blocks[j].block_ref = i;
643 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i]))
644 {
645 alloc_prep->vt_blocks[i] = j;
646 // Also, set its allocations.
647 assert(assigned[i] > 0);
648 const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
649 alloc_prep->blocks[j].offset = allocated_offset[i];
650 if (!alloc_prep->buffers[buffer_ref].type)
651 alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
652 alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
653 alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i]);
654 assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size);
655 } else {
656 alloc_prep->vt_blocks[i] = -1;
657 alloc_prep->blocks[j].buffer_ref = -1;
658 alloc_prep->blocks[j].offset = 0;
659 }
660 ++j;
661 } else
662 alloc_prep->vt_blocks[i] = -1;
663 ccfree(allocated_offset);
664 ccfree(assigned);
665 return alloc_prep;
666}
667
668static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
669{
670 int i;
671 for (i = 0; i < alloc_prep->vt_block_size; i++)
672 if (alloc_prep->alloc_dep[i])
673 ccv_array_free(alloc_prep->alloc_dep[i]);
674 for (i = 0; i < alloc_prep->buffer_size; i++)
675 if (alloc_prep->buffers[i].dup_p_refs)
676 ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
677 ccfreefree(alloc_prep->alloc_dep);
678 ccfreefree(alloc_prep);
679}
680
681// Simple allocator from ccv_array_t.
682static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
683{
684 int pos = tensor_metadata->rnum;
685 int rsize = (size + 15) / 16;
686 ccv_array_resize(tensor_metadata, pos + rsize);
687 return (pos << 1) + 1;
688}
689
690static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
691{
692 assert((pos >> 1) < tensor_metadata->rnum);
693 return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1);
694}
695
696#define CCV_NNC_IS_METADATA_POS(ptr) ((uintptr_t)(ptr) & 1)
697
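// A minimal sketch of how the metadata "position" encoding above round-trips, assuming
// tensor_metadata was created with a 16-byte element size (which the (size + 15) / 16
// rounding in _ccv_nnc_tensor_metadata_pos_new implies):
//
//   const int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
//   // The low bit is set, so the value can be stored where a ccv_nnc_tensor_t* is expected
//   // and still be told apart from a real pointer.
//   assert(CCV_NNC_IS_METADATA_POS((ccv_nnc_tensor_t*)(intptr_t)pos));
//   ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
//
// _ccv_nnc_tensor_metadata_rewire below is the pass that later swaps such tagged positions
// for real pointers, presumably once the metadata array can no longer grow and relocate.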
698static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
699{
700 // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
701 if (!CCV_NNC_IS_METADATA_POS(vt_tensor))
702 return vt_tensor;
703 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
704 if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref))
705 {
706 const int alias_ref = tensor->alias_ref;
707 tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
708 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
709 }
710 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
711 {
712 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
713 int i;
714 const int count = mv->kind + mv->repeat;
715 for (i = 0; i < count; i++)
716 {
717 if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
718 {
719 const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i];
720 CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
721 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
722 }
723 }
724 // No need to recursively do parent pointer, otherwise we are in deep rewire.
725 if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p))
726 mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
727 if (mv->sp)
728 for (i = 0; i < mv->sp->rnum; i++)
729 {
730 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
731 if (CCV_NNC_IS_METADATA_POS(*tensor))
732 {
733 const int pos = (int)(intptr_t)*tensor;
734 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
735 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
736 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
737 }
738 }
739 }
740 return tensor;
741}
742
743typedef struct {
744 const uint8_t* ptr;
745 int pos;
746} ccv_nnc_tensor_block_pos_t;
747
748static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
749{
750 int i;
751 int unref_block_ref = block_ref;
752 while (prep->tensor_blocks[unref_block_ref].ref)
753 unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
754 int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
755 assert(vt_ref >= 0);
756 assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref);
757 const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
758 uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
759 int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
760 for (i = idx - 1; i >= 0; i--)
761 {
762 assert(p_ref >= 0);
763 const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
764 const int unroll_count = graph_prep->unroll_count;
765 if (ch[i]) // Prefer the dup side of things.
766 p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
767 int unref_p_ref = p_ref;
768 while (graph_prep->tensor_blocks[unref_p_ref].ref)
769 unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
770 vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
771 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
772 offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
773 // If the buffer already exists, prefer that.
774 const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
775 if (ptr)
776 {
777 // If I have any remaining path that is not covered from 0, I cannot possibly
778 // have any pointer from buffer (that can only happen if it is not dup).
779 for (--i; i >= 0; i--)
780 if (ch[i] != 0)
781 return 0;
782 // Try to find the created tensor block pos in the array, just linear scan.
783 const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
784 ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
785 *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
786 ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
787 return tv_pos;
788 }
789 p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
790 }
791 return 0;
792}
793
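// In the two helpers around this point, ch[] records, for each nesting level already
// descended, which unrolled duplicate was chosen: 0 means the original block, and k > 0
// means the k-th duplicate looked up through dup_tensor_block_ref. This keeps the same
// block resolved consistently across an unrolled while loop.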
794// Descent from root to the prep level, and compose multiview from there.
795static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
796{
797 assert(pos_ref);
798 int i;
799 const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
800 const int unroll_count = prep->unroll_count;
801 if (prep == graph_prep)
802 {
803 const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
804 if (!data_pos)
805 return -1;
806 // Based on ch, go all the way back to find the exact pointer to compose.
807 if (// !assign_update && // If I plan to receive the assign update, we don't need to have multiple receivers. Just one tensor to receive the update is enough.
808 prep->dup_tensor_block_ref &&
809 prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
810 prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
811 {
812 int pos[unroll_count + 1];
813 pos[0] = data_pos;
814 for (i = 0; i < unroll_count; i++)
815 pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
816 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
817 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
818 ccv_nnc_tensor_t* data[unroll_count + 1];
819 for (i = 0; i < unroll_count + 1; i++)
820 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
821 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
822 for (i = 0; i < unroll_count + 1; i++)
823 CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
824 *pos_ref = mv_pos;
825 } else {
826 *pos_ref = data_pos;
827 }
828 if (preserve)
829 {
830 // If we need to preserve, this needs to be more complicated. At loop 0, I need to access the newly assigned tv.
831 // at any other loops, it should be the same. Thus, for this case, I will create a mv tensor as following:
832 // mv of K11, thus, when loop is 0, it unwrap to mv->data[0], otherwise, unwrap to mv->data[1].
833 // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
834 // arena allocated).
835 // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed a raw pointer directly or
836 // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
837 // it in a K01 structure.
838 // Why didn't we wrap it directly as mv->data[0] pointing to the assigned tv pointer and mv->data[1] pointing
839 // to the raw pointer (as ptr_ref) with K11? The reason is we don't know whether the assigned tv points to one
840 // memory region, or is managed by a multi-view tensor, which could point to different memory regions.
841 int prev_mv_pos = *pos_ref;
842 if (prev_mv_pos == -1)
843 {
844 prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
845 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
846 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
847 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
848 tv,
849 }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
850 CCV_NNC_MULTIVIEW_DATA(mv)[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
851 }
852 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
853 ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
854 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
855 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
856 CCV_NNC_TENSOR_PLACEHOLDER,
857 (ccv_nnc_tensor_t*)prev_mv,
858 }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
859 prev_mv->p = (void*)(intptr_t)mv_pos;
860 CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
861 CCV_NNC_MULTIVIEW_DATA(mv)[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
862 *pos_ref = mv_pos;
863 }
864 return 0;
865 }
866 ch[idx] = 0;
867 int pos[unroll_count + 1];
868 pos[0] = 0;
869 const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
870 assert(retval == 0);
871 for (i = 0; i < unroll_count; i++)
872 {
873 ch[idx] = i + 1;
874 pos[i + 1] = 0;
875 const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
876 if (dup_retval < 0)
877 {
878 assert(i == 0);
879 break;
880 }
881 }
882 // If current prep has no dup.
883 if (i == 0)
884 {
885 *pos_ref = pos[0];
886 return 0;
887 }
888 ccv_nnc_tensor_t* data[unroll_count + 1];
889 // Compose to a new multiview.
890 for (i = 0; i < unroll_count + 1; i++)
891 { assert(pos[i] > 0); }
892 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
893 for (i = 0; i < unroll_count + 1; i++)
894 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
895 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
896 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
897 for (i = 0; i < unroll_count + 1; i++)
898 if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER && CCV_IS_TENSOR_MULTIVIEW(data[i]))
899 ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
900 for (i = 0; i < unroll_count + 1; i++)
901 CCV_NNC_MULTIVIEW_DATA(mv)[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
902 *pos_ref = mv_pos;
903 return 0;
904}
905
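// Return convention for the helper below (summarized from its body): 1 when p_ref appears
// among the node's outputs (output wins when it is both an input and an output), -1 when
// it only appears among the inputs, and 0 when it appears in neither.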
906static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
907{
908 int i;
909 int is_input = 0;
910 assert(node);
911 for (i = 0; i < node->input_size && !is_input; i++)
912 if (p_ref == node->inputs[i])
913 is_input = 1;
914 int is_output = 0;
915 for (i = 0; i < node->output_size && !is_output; i++)
916 if (p_ref == node->outputs[i])
917 is_output = 1;
918 // Prefer it is an output if it is both the input and the output.
919 if (is_output)
920 return 1;
921 if (is_input)
922 return -1;
923 return 0;
924}
925
926static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
927{
928 // No need to check whether to preserve if this is not a while loop.
929 if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
930 return 0;
931 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
932 // If it is unassigned, no need to preserve.
933 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
934 return 0;
935 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
936 // If p is not input, no need to preserve at all.
937 if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
938 return 0;
939 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
940 assert(vt_ref >= 0);
941 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
942 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
943 // If the buffer is a truly read-only one, no need to preserve.
944 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
945 return 0;
946 /* This needs a detailed explanation: what does preserve mean?
947 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
948 * also used outside of the while loop, we cannot reuse the memory region of x for
949 * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
950 * y uses the same memory region as x). The way to work around this is by using a different
951 * memory region for y = x + 1, but for the first iteration, having x pointing to the
952 * original. During the allocation process, the way to identify whether x should preserve
953 * its value or not is by looking up its parent tensor. If the symbol (tensor_block)'s input
954 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
955 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
956 * it is the input tensor whenever that is possible. A tensor block can point to two parent
957 * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
958 * tensor whenever that is possible. */
959 if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
960 return 0;
961 // Otherwise, return 1 because we now need to preserve.
962 return 1;
963}
964
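// The check below mirrors _ccv_nnc_tensor_block_check_preserve above, but on the output
// side: it only applies to tape vars (CCV_NNC_TENSOR_SYMBOL_TAPE_VAR), and a non-zero
// return means the value has to be force-broadcast rather than shared in place.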
965static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
966{
967 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size);
968 // If it is unassigned, no need to preserve.
969 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref]))
970 return 0;
971 // Only tape vars need to force broadcast, otherwise we already share the same memory region.
972 if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
973 return 0;
974 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
975 // If p is not output, no need to broadcast at all.
976 if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
977 return 0;
978 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
979 assert(vt_ref >= 0);
980 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref);
981 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
982 // If the buffer is a truly read-only one, no need to broadcast.
983 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref]) == READ_ONLY)
984 return 0;
985 // Otherwise, return 1 because we now need to force broadcast for this tape var.
986 return 1;
987}
988
989static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
990{
991 assert(CCV_IS_TENSOR_MULTIVIEW(mv));
992 int i;
993 for (i = 0; i < mv->kind + mv->repeat; i++)
994 if (CCV_NNC_MULTIVIEW_DATA(mv)[i] == CCV_NNC_TENSOR_PLACEHOLDER)
995 CCV_NNC_MULTIVIEW_DATA(mv)[i] = tensor;
996 else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
997 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor);
998}
999
1000static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1001{
1002 assert(CCV_IS_TENSOR_MULTIVIEW(mv));
1003 int i;
1004 if (mv->sp)
1005 for (i = 0; i < mv->sp->rnum; i++)
1006 {
1007 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i);
1008 if (CCV_NNC_IS_METADATA_POS(*tensor))
1009 {
1010 const int pos = (int)(intptr_t)*tensor;
1011 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1012 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor));
1013 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1014 }
1015 }
1016 for (i = 0; i < mv->kind + mv->repeat; i++)
1017 {
1018 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1019 CCV_NNC_MULTIVIEW_DATA(mv)[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1020 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref))
1021 CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref);
1022 if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1023 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1024 }
1025}
1026
1027static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1028{
1029 // Go to the root of the graph.
1030 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1031 int i;
1032 for (i = 1; prep->p; i++)
1033 prep = prep->p;
1034 // Root graph should have no dup tensor blocks.
1035 assert(!prep->dup_tensor_block_ref);
1036 const int c = i;
1037 const ccv_nnc_symbolic_graph_prep_t* preps[c];
1038 prep = graph_prep;
1039 preps[c - 1] = prep;
1040 for (i = 0; prep->p; i++)
1041 preps[c - 2 - i] = prep = prep->p;
1042 int ch[c]; // Use a variable-length array here. This records our selections when recursing from top to bottom.
1043 memset(ch, 0, sizeof(int) * c);
1044 int pos = 0;
1045 _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1046 assert(ch[c - 1] == 0); // This should never be modified.
1047 assert(pos > 0);
1048 return pos;
1049}
1050
1051static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1052{
1053 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1054 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1055 ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1056 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1057 CCV_NNC_TENSOR_PLACEHOLDER,
1058 tv,
1059 }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1060 CCV_NNC_MULTIVIEW_DATA(mv)[0] = CCV_NNC_TENSOR_PLACEHOLDER;
1061 CCV_NNC_MULTIVIEW_DATA(mv)[1] = tensor;
1062 return mv_pos;
1063}
1064
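// The helper below "flattens" a multiview position: it follows data[0] down to a plain
// tensor, copies that tensor into a fresh metadata slot, points the copy's alias_ref back
// at the multiview, and registers the copy via ccv_nnc_tensor_synchronize_to_multiview so
// the two stay in sync when the multiview is unwrapped.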
1065static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1066{
1067 ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1068 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr);
1069 if (!is_multiview)
1070 return pos;
1071 while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr))
1072 {
1073 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1074 tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1075 }
1076 const ccv_nnc_tensor_t tensor = *tensor_ptr;
1077 const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1078 ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1079 *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1080 new_tensor->dataof = tensor.dataof;
1081 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1082 new_tensor->alias_ref = (uintptr_t)pos;
1083 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1084 return new_pos;
1085}
1086
1087static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1088{
1089 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1090 // What it references is itself not an alias.
1091 assert(vt_tensors[alias_ref]);
1092 const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1093 const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1094 assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr));
1095 // Will use that to determine whether to insert a reference or not.
1096 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr);
1097 while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr))
1098 {
1099 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1100 alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[0]);
1101 }
1102 const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1103 // If there is no ofs, and the stride is packed (matches dim), we take a shortcut and just init as a normal tensor.
1104 int pos;
1105 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1106 ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1107 {
1108 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1109 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1110 *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1111 tensor->dataof = alias_tensor.dataof;
1112 } else {
1113 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1114 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1115 // Otherwise initialize a tensor view
1116 *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1117 tensor_view->alias_ref = (uintptr_t)alias_pos;
1118 }
1119 vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1120 if (is_multiview)
1121 {
1122 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1123 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1124 }
1125}
1126
1127static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1128{
1129 // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1130 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1131 {
1132 const int ref = tensor_blocks[block_ref].alias_ref - 1;
1133 if (!vt_tensors[ref])
1134 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1135 vt_tensors[block_ref] = vt_tensors[ref];
1136 return;
1137 }
1138 assert(tensor_symbol_info[block_ref].alias_ref);
1139 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1140 // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1141 if (!vt_tensors[alias_ref])
1142 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1143 _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1144}
1145
1146// Turn a linear pointer into an object storage (such as MTLBuffer).
1147#ifdef HAVE_MPS
1148static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
1149{
1150 mpobjfree(0, ptr);
1151}
1152#endif
1153
1154typedef struct {
1155 size_t size;
1156 void* obj;
1157} tensor_arena_obj_track_t;
1158
1159typedef struct {
1160 void* ptr;
1161 off_t offset;
1162 size_t size;
1163} obj_ptr_key_t;
1164
1165static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1166{
1167 return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1168}
1169
1170static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1171{
1172 return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1173}
1174
1175KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)
1176
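// On MPS, create (or reuse via obj_ptr_map) the backing object for the region (ptr, offset, size)
// and register an _ccv_nnc_tensor_arena_obj_dispose disposer on the arena so the object can be
// released later; on other backends this simply resolves to ptr + offset.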
1177static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1178{
1179 if (params.dim[0] == 0)
1180 return 0;
1181#ifdef HAVE_MPS
1182 if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1183 {
1184 int ret;
1185 const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
12]
* ccv_nnc_tensor_count(params);
1186 const obj_ptr_key_t key = {
1187 .ptr = ptr,
1188 .offset = offset,
1189 .size = size,
1190 };
1191 khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
1192 if (ret != 0)
1193 {
1194 void* obj = mpobjcreate(ptr, offset, size);
1195 if (!tensor_arena->disposers)
1196 tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1197 ccv_nnc_arena_disposer_t disposer = {
1198 .ptr = obj,
1199 .userdata = 0,
1200 .dispose = _ccv_nnc_tensor_arena_obj_dispose
1201 };
1202 ccv_array_push(tensor_arena->disposers, &disposer);
1203 kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1204 return obj;
1205 } else
1206 return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1207 }
1208#endif
1209 return ptr + offset;
1210}
1211
1212static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1213{
1214 // All tensors are assigned out now; num_assigned is the number of discontinuous buffers.
1215 // Each tensor has its designation in the assigned array, and its offset in allocated_offset.
1216 const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1217 ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1218 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1219 const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1220 const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1221 const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1222 const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1223 const int unroll_count = graph_prep->unroll_count;
1224 int i, j;
1225 for (i = 0; i < tensor_symbol_info_size; i++)
1226 for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1227 {
1228 const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1229 if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1230 TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
)
;
1231 }
1232 ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1233 graph_prep->tensor_arena = tensor_arena;
1234 tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1235 tensor_arena->buffers = (void*)(tensor_arena + 1);
1236 tensor_arena->buffer_size = alloc_prep->buffer_size;
1237 tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1238 tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1239 tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1240 tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
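// All of the trailing arrays set up above are carved out of the single allocation made for
// tensor_arena: [ccv_nnc_tensor_arena_t header][buffers][sub_arenas][vt_tensors][vt_alias_refs].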
1241 tensor_arena->pb_vt_tensors = 0;
1242 tensor_arena->vt_alias_r_refs_p = 0;
1243 tensor_arena->vt_alias_r_refs = 0;
1244 tensor_arena->vt_sizes = 0;
1245 tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1246 tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, 0, 0);
1247 tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1248 tensor_arena->allocator.context.free = allocator.context.free;
1249 tensor_arena->allocator.isa = allocator.isa;
1250 tensor_arena->disposers = 0;
1251 // Copy alias_ref info back to the tensor arena.
1252 for (i = 0; i < tensor_symbol_info_size; i++)
1253 tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1254 // Do the buffer copies.
1255 for (i = 0; i < alloc_prep->buffer_size; i++)
1256 tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1257 tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1258 tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1259 if (graph_prep->while_count_tensor)
1260 {
1261 // If we need to have a while count tensor, allocate it first and set its pointer to point to the while_count variable.
1262 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1263 assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1263, __extension__ __PRETTY_FUNCTION__
); }))
; // pos must be 0 position.
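// Note: values produced by _ccv_nnc_tensor_metadata_pos_new() are "metadata positions", not real
// pointers: the low bit is set (the first allocation asserts pos == (0 << 1) + 1, and
// CCV_NNC_IS_METADATA_POS() tests that bit), so they can be told apart from ordinary tensor
// pointers and are converted into real pointers by _ccv_nnc_tensor_metadata_rewire() near the end
// of this function.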
1264 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1265 *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1266 }
1267 assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
&& !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
&& p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1267, __extension__ __PRETTY_FUNCTION__
); }))
;
1268 if (p_arena && p_graph_prep)
1269 {
1270 // Don't need to allocate the actual buffer, just use the pointer from the above.
1271 PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer assignment for sub arena %p (parent %p)\n",
tensor_arena, p_arena); fflush(stdout); } } while (0)
;
1272 for (i = 0; i < tensor_arena->buffer_size; i++)
1273 {
1274 const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1275 int unref_p_ref = p_ref;
1276 while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1277 unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1278 assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1278, __extension__ __PRETTY_FUNCTION__
); }))
;
1279 const int p_unroll_count = p_graph_prep->unroll_count;
1280 if (p_graph_prep->dup_tensor_block_ref &&
1281 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1282 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1283 {
1284 // This condition means that in the parent graph we point to multiple tensor blocks for the same
1285 // buffer; therefore, we cannot have one single pointer assigned in this case.
1286 // Later we will handle this by generating a ccv_nnc_tensor_multiview_t structure.
1287 tensor_arena->buffers[i].ptr = 0;
1288 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1289 continue;
1290 }
1291 // Otherwise, find the actual buffer pointer.
1292 const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1293 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1293, __extension__ __PRETTY_FUNCTION__); }))
;
1294 const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1295 if (!p_arena->buffers[buffer_ref].ptr)
1296 {
1297 // Pass it down as 0 ptr.
1298 tensor_arena->buffers[i].ptr = 0;
1299 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1300 continue;
1301 }
1302 const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1303 tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1304 PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
(0)
;
1305 }
1306 } else {
1307 // Now, allocate actual buffers.
1308 PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0)
;
1309 for (i = 0; i < tensor_arena->buffer_size; i++)
1310 {
1311 const int buffer_type = tensor_arena->buffers[i].type;
1312 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1313#ifdef HAVE_CUDA1
1314 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1315 {
1316 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1317 if (allocator.isa && allocator.isa->alloc)
1318 tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1319 else
1320 tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1321 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1322 } else {
1323 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1323, __extension__ __PRETTY_FUNCTION__
); }))
;
1324 if (tensor_arena->buffers[i].pin_mem)
1325 tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1326 else
1327 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1328 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1329 }
1330#elif defined(HAVE_MPS)
1331 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1332 {
1333 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1334 // if (allocator.isa && allocator.isa->alloc)
1335 // tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1336 // else
1337 tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1338 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1339 } else {
1340 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1340, __extension__ __PRETTY_FUNCTION__
); }))
;
1341 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1342 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1343 }
1344#else
1345 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1345, __extension__ __PRETTY_FUNCTION__
); }))
;
1346 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1347 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1348#endif
1349 assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1349, __extension__ __PRETTY_FUNCTION__); }))
;
1350 }
1351 }
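// To summarize the two branches above: a sub arena never allocates; its buffers alias into the
// parent arena's buffers at the recorded offsets (or stay 0 when a multi-view is required), while
// the root arena allocates fresh memory per buffer: GPU buffers come from the custom allocator or
// cumalloc() (mpheapalloc() under MPS), CPU buffers from cuhostalloc() when pinned (CUDA build)
// and from 64-byte-aligned posix_memalign() otherwise.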
1352 // Go over sub_preps and allocate arenas for them. Do it this early because
1353 // we may reference tensors from sub arenas; the reason we need to reference
1354 // tensors from sub arenas is that, for output tensors, the sub arena's tensor
1355 // will have automatic reference updates.
1356 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1357 if (graph_prep->sub_preps[i])
1358 tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1359 else
1360 tensor_arena->sub_arenas[i] = 0;
1361 memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1362 // Now that sub-arenas are all assigned, go over their outputs to assign out tensors from those outputs directly.
1363 ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1364#ifdef HAVE_MPS
1365 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1366#else
1367 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1368#endif
1369 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1370 if (tensor_arena->sub_arenas[i])
1371 {
1372 assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1372, __extension__ __PRETTY_FUNCTION__
); }))
;
1373 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1374 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1375 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1376 for (j = 0; j < node->output_size; j++)
1377 {
1378 const int idx = node->outputs[j];
1379 const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1;
1380 assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
(s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1380, __extension__ __PRETTY_FUNCTION__); }))
;
1381 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1382 assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1382, __extension__ __PRETTY_FUNCTION__); }))
;
1383 ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1384 // Only assign if it is a multiview tensor.
1385 if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1386 (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1387 sub_arena_out_tensors[idx] = sub_tensor;
1388 }
1389 }
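// For sub graphs driven by a while exec (CCV_NNC_GRAPH_EXEC_P_WHILE), sub_arena_out_tensors now
// records, per output symbol, the sub arena's tensor whenever that tensor (or its alias) is a
// multi-view; later passes use it so the parent references the sub arena's tensor directly
// instead of generating its own.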
1390 // Assigning out the tensors (in case of sharing tensors / in-place ops).
1391 for (i = 0; i < tensor_symbol_info_size; i++)
1392 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
1393 {
1394 const int vt_ref = alloc_prep->vt_blocks[i];
1395 const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1396 // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1397 // previous layer; therefore, we cannot really find the buffer ptr.
1398 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1399 ((graph_prep->dup_tensor_block_ref &&
1400 graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1401 graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1402 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1403 {
1404 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1404, __extension__ __PRETTY_FUNCTION__
); }))
; // This must be in a sub-graph.
1405 // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1406 if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1407 continue;
1408 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1409 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1410 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1411 } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1412 // When we want to allocate, we don't really need to if it needs a forced broadcast, because we will handle that later.
1413 const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1414 // If already created, use the same tensor, and continue.
1415 // Having ptr.
1416 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1417 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1418 // Also, set its allocations.
1419 // Since tensor view is bit compatible with tensor, we can just cast.
1420 void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1421 *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1422 assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1422, __extension__ __PRETTY_FUNCTION__
); }))
;
1423 // If we need to force broadcast, we need to wrap it in a multiview.
1424 if (graph_prep->tensor_blocks[i].p_refs[0] &&
1425 _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1426 {
1427 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1428 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1429 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1430 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1431 tv,
1432 }, 0, 1, graph_prep->graph, mv);
1433 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1434 pos = mv_pos;
1435 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1436 }
1437 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1438 }
1439 }
1440#ifdef HAVE_MPS
1441 kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1442#endif
1443 // Handle bound tensors. First handle the cases without aliases.
1444 for (i = 0; i < tensor_bind_size; i++)
1445 {
1446 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1446, __extension__ __PRETTY_FUNCTION__
); }))
;
1447 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1448 if (resolved_symbol.d >= 0)
1449 {
1450 int d = resolved_symbol.d;
1451 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1452 continue;
1453 // This check is for in-place ops. Only an in-place op could be unassigned but still have a ref.
1454 // It has nothing to do with aliases.
1455 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1456 d = tensor_blocks[d].ref - 1;
1457 // For bound tensors, it shouldn't be assigned yet.
1458 // If it is assigned, the pointer should match the one from the bound tensor.
1459 // This can only happen if an enforced in-place tensor is bound twice. If that
1460 // happens, we need to make sure it is bound to the same location.
1461 assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1461, __extension__ __PRETTY_FUNCTION__
); }))
;
1462 // See above assertion.
1463 if (tensor_arena->vt_tensors[d])
1464 continue;
1465 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1466 {
1467 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1468 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1469 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1470 if (otv->off > 0) // If there is an offset, this has to be the same dimensionality, or smaller in each dimension.
1471 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1472 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1472, __extension__ __PRETTY_FUNCTION__
); }))
; }
1473 // It is OK for it to be, as a whole, smaller than or equal to the bound one.
1474 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1474, __extension__ __PRETTY_FUNCTION__
); }))
;
1475 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1476 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1477 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1478 } else {
1479 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1480 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1481 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1482 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1483 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1484 tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1485 tv->dataof = tensor_binds[i].tensor->dataof;
1486 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1487 }
1488 }
1489 }
1490 // Handle bound tensors. We handle aliases here so they can reference bound tensors.
1491 for (i = 0; i < tensor_bind_size; i++)
1492 {
1493 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1493, __extension__ __PRETTY_FUNCTION__
); }))
;
1494 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1495 if (resolved_symbol.d >= 0)
1496 {
1497 int d = resolved_symbol.d;
1498 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1499 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1500 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1501 // It has nothing to do with alias.
1502 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1503 d = tensor_blocks[d].ref - 1;
1504 if (tensor_arena->vt_tensors[d])
1505 continue;
1506 // Assert original alias has no ofs. Otherwise our binding will be problematic.
1507 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1508 { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
== 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1508, __extension__ __PRETTY_FUNCTION__
); }))
; }
1509 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1510 {
1511 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1512 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1513 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1514 if (otv->off > 0) // If there is an offset, this has to be the same dimensionality, or smaller in each dimension.
1515 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1516 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1516, __extension__ __PRETTY_FUNCTION__
); }))
; }
1517 // It is OK for it to be, as a whole, smaller than or equal to the bound one.
1518 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1518, __extension__ __PRETTY_FUNCTION__
); }))
;
1519 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1520 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1521 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1522 } else {
1523 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1524 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1525 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1526 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1527 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1528 tv->data = tensor_binds[i].tensor->data;
1529 tv->dataof = tensor_binds[i].tensor->dataof;
1530 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1531 }
1532 }
1533 }
1534 // Assign out refs. Refs are the simple ones, so we handle them first (they point to exactly the same metadata and the same region).
1535 // Avoid refs that are actually aliases.
1536 for (i = 0; i < tensor_symbol_info_size; i++)
1537 // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1538 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1539 {
1540 int ref = tensor_blocks[i].ref - 1;
1541 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1542 ref = tensor_blocks[ref].ref - 1;
1543 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1543, __extension__ __PRETTY_FUNCTION__); }))
;
1544 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1545 }
1546 // Now that refs are assigned out, handle the case where I need to preserve because I am a sub-graph of a while loop.
1547 if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1548 {
1549 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1549, __extension__ __PRETTY_FUNCTION__
); }))
;
1550 const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1551 const int p_idx = graph_prep->p_idx - 1;
1552 for (i = 0; i < node->input_size; i++)
1553 {
1554 const int idx = node->inputs[i];
1555 int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx)))
- 1;
1556 assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1556, __extension__ __PRETTY_FUNCTION__); }))
;
1557 const int vt_ref = alloc_prep->vt_blocks[block_ref];
1558 if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1559 continue;
1560 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1560, __extension__ __PRETTY_FUNCTION__); }))
;
1561 const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1562 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1562, __extension__ __PRETTY_FUNCTION__); }))
;
1563 assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1563, __extension__ __PRETTY_FUNCTION__
); }))
;
1564 // Either we have dup_tensor_block_ref in the current layer, or we have it in the
1565 // previous layer; therefore, we cannot really find the buffer ptr.
1566 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by the sub arena, it can be an ordinary out tensor. (What if the out tensor is not even generated by the sub graph when running? In that case, the behavior is undefined anyway.)
1567 ((graph_prep->dup_tensor_block_ref &&
1568 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1569 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1570 !tensor_arena->buffers[buffer_ref].ptr))
1571 {
1572 // We haven't allocated anything for this yet.
1573 assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1573, __extension__ __PRETTY_FUNCTION__
); }))
;
1574 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1575 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1576 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1577 } else {
1578 const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1579 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1580 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1581 }
1582 }
1583 }
1584 // For the case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1585 // This creates the multi-view tensor to achieve that.
1586 for (i = 0; i < tensor_symbol_info_size; i++)
1587 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1588 {
1589 const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1590 // Create phi multi-view.
1591 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1592 const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1593 const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1594 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1595 ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1596 ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1597 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1598 intv,
1599 outv,
1600 }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1601 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1602 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1603 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1604 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1605 }
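// Each phi multi-view built above has exactly two views: data[0] is the (flattened) original input
// from bypass_ref, selected on the skip branch, and data[1] is this block's own (flattened) output.
// The CCV_NNC_MULTIVIEW_PHI anchor marks it so later passes (e.g. line 1678) can recognize and
// unwrap it.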
1606 // Now it is time to handle alias.
1607 for (i = 0; i < alloc_prep->block_size; i++)
1608 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1609 {
1610 const int block_ref = alloc_prep->blocks[i].block_ref;
1611 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1612 {
1613 // Assigning out the tensor aliases.
1614 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1614, __extension__ __PRETTY_FUNCTION__
); }))
;
1615 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1616 }
1617 }
1618 // Now assigning out the rest of alias refs.
1619 for (i = 0; i < tensor_symbol_info_size; i++)
1620 // It could be a bound tensor (or unused); in that case, it doesn't have a ref.
1621 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1622 {
1623 int ref = tensor_blocks[i].alias_ref - 1;
1624 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1624, __extension__ __PRETTY_FUNCTION__); }))
;
1625 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1626 }
1627 // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1628 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1629 if (tensor_arena->sub_arenas[i])
1630 {
1631 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1632 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1633 for (j = 0; j < node->input_size; j++)
1634 {
1635 const int idx = node->inputs[j];
1636 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1637 if (s_idx < 0)
1638 continue;
1639 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1640 // Only do the replacement if it is a multi-view tensor.
1641 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1642 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1643 {
1644 // It cannot be a bound tensor.
1645 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1645, __extension__ __PRETTY_FUNCTION__
); }))
;
1646 const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1647 const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1648 ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1649 // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1650 // to this tensor.
1651 if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1652 {
1653 const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1654 ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1655 ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1656 ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1657 ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1658 ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
: (multiview)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]);
1659 while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1660 tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]);
1661 *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1662 ref_tensor->data = tv->data;
1663 ref_tensor->dataof = tv->dataof;
1664 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1665 } else
1666 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1667 }
1668 }
1669 }
1670 // After aliases are created, for the case..of statement we now revert back to a flat tensor rather than a multi-view.
1671 // No worries though, this new tensor is subscribed to the phi multi-view. Moreover, we have logic
1672 // when initializing the case..of node, which will take the phi multi-view again.
1673 for (i = 0; i < tensor_symbol_info_size; i++)
1674 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1675 {
1676 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
& 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1676, __extension__ __PRETTY_FUNCTION__
); }))
;
1677 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1678 assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1678, __extension__ __PRETTY_FUNCTION__); }))
;
1679 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1680 }
1681 // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1682 for (i = 0; i < tensor_symbol_info_size; i++)
1683 if (tensor_arena->vt_tensors[i])
1684 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1685 // Associate multiview tensors from sub arena to the parent.
1686 if (sub_arena_out_tensors)
1687 {
1688 for (i = 0; i < alloc_prep->block_size; i++)
1689 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1690 {
1691 const int block_ref = alloc_prep->blocks[i].block_ref;
1692 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1693 continue;
1694 int sub_arena_ref = block_ref;
1695 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1696 {
1697 // Assigning out the tensor aliases.
1698 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1698, __extension__ __PRETTY_FUNCTION__
); }))
;
1699 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1700 // What it references is not an alias.
1701 assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1701, __extension__ __PRETTY_FUNCTION__
); }))
;
1702 sub_arena_ref = alias_ref;
1703 if (!sub_arena_out_tensors[sub_arena_ref])
1704 continue;
1705 }
1706 if (!sub_arena_out_tensors[sub_arena_ref])
1707 continue;
1708 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1709 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1709, __extension__ __PRETTY_FUNCTION__); }))
;
1710 // This is only possible if the vt_tensors entry is a phi node.
1711 if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1712 {
1713 // For a phi node, sub_arena_out_tensors is only relevant to its selected output. Therefore, set that to be the receiver of the broadcast.
1714 ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1715 assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1715, __extension__ __PRETTY_FUNCTION__); }))
;
1716 assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1716, __extension__ __PRETTY_FUNCTION__
); }))
;
1717 CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]->alias_ref = (uintptr_t)mv;
1718 ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]);
1719 } else {
1720 tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1721 ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1722 }
1723 }
1724 }
1725 // Go over all the tensors that have assign_ref. If the tensor it is assigned from is:
1726 // 1). From sub_arena_out_tensors: it is possible that it is now pointing to an area this arena doesn't know about.
1727 // 2). From a phi multi-view: in this case, this arena won't know beforehand which memory it is going to use.
1728 // Therefore, for the above two scenarios, the tensor that has assign_ref, even if it is a multiview tensor, needs to subscribe
1729 // to the output of the assign_ref tensor.
1730 for (i = 0; i < tensor_symbol_info_size; i++)
1731 if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1732 {
1733 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1734 ccv_nnc_tensor_t* assign_tensor;
1735 if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1736 assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1737 else
1738 assign_tensor = tensor_arena->vt_tensors[assign_ref];
1739 ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1740 }
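// For every tensor with an assign_ref, the loop above registers a carry-over on the concrete graph:
// the source is the assign_ref's tensor (or its multi-view coming from a sub arena) and the
// destination is this tensor, so the value can be carried over between while-loop iterations.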
1741 // After everything is handled, assert again to make sure the tensors and tensor binds point to the right location. This is really just for assertion.
1742 for (i = 0; i < tensor_bind_size; i++)
1743 {
1744 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1744, __extension__ __PRETTY_FUNCTION__
); }))
;
1745 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1746 if (resolved_symbol.d >= 0)
1747 {
1748 int d = resolved_symbol.d;
1749 // This check is for in-place ops. Only an in-place op could be unassigned but still have a ref.
1750 // It has nothing to do with aliases.
1751 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1752 d = tensor_blocks[d].ref - 1;
1753 // Note we don't trace back on alias. This is intentional.
1754 assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1754, __extension__ __PRETTY_FUNCTION__
); }))
;
1755 }
1756 }
1757 if (sub_arena_out_tensors)
1758 ccfreefree(sub_arena_out_tensors);
1759 // Rewire sub arena's tensor references.
1760 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1761 if (tensor_arena->sub_arenas[i])
1762 {
1763 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1764 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1765 for (j = 0; j < node->input_size; j++)
1766 {
1767 const int idx = node->inputs[j];
1768 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1769 if (s_idx < 0)
1770 continue;
1771 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1772 // Only do the replacement if it is a multi-view tensor.
1773 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1774 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1775 {
1776 // This is a bound tensor, bind it now.
1777 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1778 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1779 else
1780 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1781 }
1782 }
1783 }
1784 return tensor_arena;
1785}
1786
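// Recursively walk this arena and its sub arenas looking for the one whose graph_ref matches the
// given symbolic graph, then return its vt tensor at pair_ref (or 0 if not found).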
1787static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1788{
1789 assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1789, __extension__ __PRETTY_FUNCTION__); }))
;
1790 if ((intptr_t)graph == tensor_arena->graph_ref)
1791 {
1792 assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
0 && pair_ref < tensor_arena->vt_tensor_size) ;
else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1792, __extension__ __PRETTY_FUNCTION__
); }))
;
1793 return tensor_arena->vt_tensors[pair_ref];
1794 }
1795 int i;
1796 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1797 if (tensor_arena->sub_arenas[i])
1798 {
1799 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1800 if (tensor)
1801 return tensor;
1802 }
1803 return 0;
1804}
1805
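// Mark a tensor as tape-allocated; for a multi-view this recurses into every view so the whole
// tree carries CCV_TAPE_ALLOC.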
1806static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1807{
1808 if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1809 tensor->type |= CCV_TAPE_ALLOC;
1810 else {
1811 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1812 mv->type |= CCV_TAPE_ALLOC;
1813 int i;
1814 for (i = 0; i < mv->repeat + mv->kind; i++)
1815 _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1816 }
1817}
1818
1819static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1820{
1821 assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
__assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1821, __extension__ __PRETTY_FUNCTION__
); }))
;
1822 int i;
1823 for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1824 {
1825 if (graph_prep->tensor_symbol_info[i].pair_ref)
1826 {
1827 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1828 // No need to continue checking this if it comes from its pair.
1829 continue;
1830 }
1831 if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1832 {
1833 // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1834 if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
)
)
1835 {
1836 const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1837 if (vt_ref >= 0 &&
1838 TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc)
== READ_ONLY)
1839 continue;
1840 }
1841 _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1842 }
1843 }
1844 for (i = 0; i < graph_prep->sub_prep_size; i++)
1845 if (graph_prep->sub_preps[i])
1846 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1847}
1848
1849static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1850{
1851 int i, found = 0;
1852 // Try to insert head.
1853 ccv_array_t* head = tensor_blocks.head;
1854 assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1854, __extension__ __PRETTY_FUNCTION__); }))
;
1855 for (i = 0; i < head->rnum;)
1856 {
1857 const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
;
1858 if (head_idx == idx)
1859 {
1860 found = 1;
1861 break;
1862 }
1863 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1864 if (cell.i32 && cell.i32[0] > 0)
1865 {
1866 /* If the current node is the parent of the head node, check if we found it or not. */
1867 /* If not found, replace the current one. */
1868 if (!found)
1869 {
1870 found = 1;
1871 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= idx;
1872 } else {
1873 /* Remove the current one, change the rnum. */
1874 if (i < head->rnum - 1)
1875 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(head->rnum - 1)))
;
1876 --head->rnum;
1877 continue;
1878 }
1879 } else {
1880 // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1881 cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1882 if (cell.i32 && cell.i32[0] > 0)
1883 {
1884 found = 1;
1885 break;
1886 }
1887 }
1888 /* Advancing i. */
1889 ++i;
1890 }
1891 /* If not found, push this idx to the end of the array. */
1892 if (!found)
1893 ccv_array_push(head, &idx);
1894 // Try to insert tail.
1895 found = 0;
1896 ccv_array_t* tail = tensor_blocks.tail;
1897 assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1897, __extension__ __PRETTY_FUNCTION__); }))
;
1898 for (i = 0; i < tail->rnum;)
1899 {
1900 const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
;
1901 if (tail_idx == idx)
1902 {
1903 found = 1;
1904 break;
1905 }
1906 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1907 if (cell.i32 && cell.i32[0] > 0)
1908 {
1909 /* If the current node is the child of the tail node, check if we found it or not. */
1910 /* If not found, replace the current one. */
1911 if (!found)
1912 {
1913 found = 1;
1914 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= idx;
1915 } else {
1916 /* Remove the current one, change the rnum. */
1917 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(tail->rnum - 1)))
;
1918 --tail->rnum;
1919 continue;
1920 }
1921 } else {
1922 // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1923 cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1924 if (cell.i32 && cell.i32[0] > 0)
1925 {
1926 found = 1;
1927 break;
1928 }
1929 }
1930 /* Advancing i. */
1931 ++i;
1932 }
1933 /* If not found, push this idx to the end of the array. */
1934 if (!found)
1935 ccv_array_push(tail, &idx);
1936}
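/* A minimal standalone sketch of the head/tail frontier bookkeeping above, using a
 * plain boolean reachability table in place of the sparse exec_dep matrix. The
 * precedes[][] table, add_head() and N below are hypothetical simplifications for
 * illustration only, not part of ccv: they show how head keeps only the earliest
 * execs, replacing any head that the inserted exec precedes and skipping the insert
 * when an existing head already precedes it. */
#include <stdio.h>

#define N 4
/* precedes[a][b] != 0 means exec a must run before exec b. */
static const int precedes[N][N] = {
	{0, 1, 1, 1},
	{0, 0, 1, 1},
	{0, 0, 0, 0},
	{0, 0, 0, 0},
};

static void add_head(int* const head, int* const rnum, const int idx)
{
	int i, found = 0;
	for (i = 0; i < *rnum;)
	{
		const int head_idx = head[i];
		if (head_idx == idx)
		{
			found = 1;
			break;
		}
		if (precedes[idx][head_idx]) /* idx is earlier than this head: replace or drop it. */
		{
			if (!found)
				head[i] = idx, found = 1;
			else {
				head[i] = head[--*rnum]; /* swap-remove the now-redundant head */
				continue;
			}
		} else if (precedes[head_idx][idx]) { /* an existing head already precedes idx: nothing to insert. */
			found = 1;
			break;
		}
		++i;
	}
	if (!found)
		head[(*rnum)++] = idx;
}

int main(void)
{
	int head[N], rnum = 0, i;
	add_head(head, &rnum, 2);
	add_head(head, &rnum, 3);
	add_head(head, &rnum, 1); /* 1 precedes both 2 and 3, so the frontier collapses to {1}. */
	for (i = 0; i < rnum; i++)
		printf("%d ", head[i]); /* prints: 1 */
	printf("\n");
	return 0;
}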
1937
1938ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1939{
1940 if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1941 {
1942 assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
0 && symbol.d < tensor_arena->vt_tensor_size) ;
else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1942, __extension__ __PRETTY_FUNCTION__
); }))
;
1943 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1944 if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1945 {
1946 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1947 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1948 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1949 return (ccv_nnc_tensor_t*)mv;
1950 }
1951 return tensor;
1952 }
1953 int i;
1954 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1955 if (tensor_arena->sub_arenas[i])
1956 {
1957 ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1958 if (tensor)
1959 return tensor;
1960 }
1961 return 0;
1962}
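/* A hypothetical usage sketch: given a tensor_arena produced earlier by
 * ccv_nnc_symbolic_graph_compile and a symbol that belongs to it (or to one of its
 * sub-graphs), ccv_nnc_tensor_from_symbol above resolves the symbol anywhere in the
 * arena hierarchy and unwraps multi-view wrappers to a concrete tensor. The helper
 * name first_element_of is made up for illustration. */
static float first_element_of(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
{
	ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(tensor_arena, symbol);
	/* The return value can be 0 if the symbol was never materialized in this arena. */
	return tensor ? tensor->data.f32[0] : 0;
}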
1963
1964ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1965{
1966 if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1967 {
1968 assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
>= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1968, __extension__ __PRETTY_FUNCTION__
); }))
;
1969 return graph_exec_arena->graph_execs[symbol.d];
1970 }
1971 int i;
1972 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1973 if (graph_exec_arena->sub_arenas[i])
1974 {
1975 ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
1976 if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
1977 return exec;
1978 }
1979 return (ccv_nnc_graph_exec_t){}; // 0.
1980}
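/* A hypothetical usage sketch along the same lines: resolve a graph exec symbol
 * against the arena hierarchy and test whether it was materialized at all. The
 * helper name exec_exists is made up for illustration. */
static int exec_exists(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
{
	const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol);
	return !CCV_NO_GRAPH_EXEC(exec); /* exec.graph == 0 means no concrete exec exists here. */
}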
1981
1982ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1983{
1984 return graph_exec_arena->source;
1985}
1986
1987ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
1988{
1989 return graph_exec_arena->destination;
1990}
1991
1992// Check whether the head is the beginning of this block.
1993static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
1994{
1995 assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 1995, __extension__ __PRETTY_FUNCTION__
); }))
;
1996 return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0)))
== head_node);
1997}
1998
1999// Check whether the tail is the end of this block.
2000static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2001{
2002 assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2002, __extension__ __PRETTY_FUNCTION__
); }))
;
2003 return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0)))
== tail_node);
2004}
2005
2006// Make two tensor blocks one. Return 1 if that happened.
2007static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2008{
2009 // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2010 if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2011 (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2012 tensor_blocks[p_ref_0].tail->rnum == 1 &&
2013 tensor_blocks[p_ref_1].head->rnum == 1 &&
2014 tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2015 *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
)
== *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
)
)
2016 {
2017 // If the two parent refs match (thus, they meet at the same node), we can concatenate them and mark one as a ref. This is very similar to in-place operation combining.
2018 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2018, __extension__ __PRETTY_FUNCTION__); }))
;
2019 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2019, __extension__ __PRETTY_FUNCTION__); }))
;
2020 ccv_array_free(tensor_blocks[p_ref_0].tail);
2021 tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2022 if (tensor_blocks[p_ref_1].p_refs[0])
2023 {
2024 assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2024, __extension__ __PRETTY_FUNCTION__
); }))
; // It simply cannot have more than one p_ref, otherwise we cannot merge.
2025 if (!tensor_blocks[p_ref_0].p_refs[0])
2026 tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2027 else
2028 tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2029 }
2030 tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2031 TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
& ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)))
;
2032 ccv_array_free(tensor_blocks[p_ref_1].head);
2033 if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2034 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
| UNFOLDABLE_AS_INPUT))
;
2035 // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2036 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
& ~0x3) | UNASSIGNED))
;
2037 tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2038 if (!tensor_blocks[p_ref_0].r_refs)
2039 tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2040 ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2041 tensor_blocks[p_ref_1].size = 0;
2042 tensor_blocks[p_ref_1].head = 0;
2043 tensor_blocks[p_ref_1].tail = 0;
2044 return 1;
2045 }
2046 return 0;
2047}
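/* A condensed sketch of the precondition the fold above requires: the producer block
 * must die exactly where the consumer block is born, i.e. the single tail exec of
 * p_ref_0 equals the single head exec of p_ref_1 and both blocks have the same type.
 * The toy_block_t type and toy_can_fold() below are hypothetical simplifications for
 * illustration, not ccv structures. */
#include <stdio.h>

typedef struct {
	int type;
	int head_rnum, head0; /* number of head execs and the first one */
	int tail_rnum, tail0; /* number of tail execs and the first one */
} toy_block_t;

static int toy_can_fold(const toy_block_t in, const toy_block_t out)
{
	return in.tail_rnum == 1 && out.head_rnum == 1 && in.type == out.type && in.tail0 == out.head0;
}

int main(void)
{
	const toy_block_t in = { .type = 0, .head_rnum = 1, .head0 = 0, .tail_rnum = 1, .tail0 = 3 };
	const toy_block_t out = { .type = 0, .head_rnum = 1, .head0 = 3, .tail_rnum = 1, .tail0 = 5 };
	printf("%d\n", toy_can_fold(in, out)); /* prints 1: the blocks meet at exec 3, so out can reuse in's memory. */
	return 0;
}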
2048
2049static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2050{
2051 int i, j, k;
2052 // Generate exec dependencies (or, in other words, partial ordering of executions).
2053 ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2054 int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2055 int buf_size;
2056 if (p_node_info)
2057 { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2057, __extension__ __PRETTY_FUNCTION__
); }))
; }
2058#define for_block(x, val) \
2059 do { \
2060 if (((int32_t*)val)[0] > 0) \
2061 { \
2062 buf[buf_size * 2] = x; \
2063 buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2064 ++buf_size; \
2065 } \
2066 } while (0)
2067 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx;
{
2068 buf_size = 0; /* save all its parent deps to this buffer */
2069 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2070 if (vector)
2071 CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
2072 if (!node->outgoings)
2073 continue;
2074 for (i = 0; i < node->outgoings->rnum; i++)
2075 {
2076 int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2077 const int32_t one = 1;
2078 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2079 /* If not found, set it. If the current node is the destination node, there is no need to
2080 * set itself as a parent of subsequent nodes, because of its terminal nature. */
2081 if (!cell.i32 || cell.i32[0] == 0)
2082 ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2083 if (buf_size > 0)
2084 {
2085 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2086 assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2086, __extension__ __PRETTY_FUNCTION__); }))
;
2087 for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2088 {
2089 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2090 /* If not found, set */
2091 if (!cell.i32 || cell.i32[0] == 0)
2092 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2093 else {
2094 /* Otherwise, set to the longest one */
2095 int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
+ 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; })
;
2096 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2097 }
2098 }
2099 }
2100 }
2101 } ccv_nnc_graph_visit_endfor} }
2102#undef for_block
2103 ccfreefree(buf);
2104 // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2105 const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2106 ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2107 // The reason is that I need to make every one of them unassigned unless it is used somewhere. It
2108 // happens that I have to loop through all relevant nodes to find out whether one is used or not.
2109 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2110 tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2111 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2112 for (i = 0; i < node->input_size; i++)
2113 if (node->inputs[i] >= 0)
2114 {
2115 tensor_blocks[node->inputs[i]].flags = 0;
2116 // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned memory.
2117 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2118 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2119 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2120 tensor_blocks[node->inputs[i]].pin_mem = 1;
2121 }
2122 for (i = 0; i < node->output_size; i++)
2123 if (node->outputs[i] >= 0)
2124 {
2125 tensor_blocks[node->outputs[i]].flags = 0;
2126 // If this is a data transfer node, and this is CPU memory, mark the memory type as pinned memory.
2127 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2128 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2129 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2130 tensor_blocks[node->outputs[i]].pin_mem = 1;
2131 }
2132 } ccv_nnc_graph_visit_endfor} }
2133 if (p_node_info)
2134 {
2135 assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2135, __extension__ __PRETTY_FUNCTION__
); }))
;
2136 // Mark it as used if it is used in either input or output.
2137 for (i = 0; i < p_node_info->input_size; i++)
2138 if (p_node_info->inputs[i] >= 0)
2139 {
2140 const int d = p_node_info->inputs[i];
2141 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2142 {
2143 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2144 if (dd >= 0) // If this exists in this sub-graph, great.
2145 tensor_blocks[dd].flags = 0;
2146 }
2147 }
2148 for (i = 0; i < p_node_info->output_size; i++)
2149 if (p_node_info->outputs[i] >= 0)
2150 {
2151 const int d = p_node_info->outputs[i];
2152 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2153 {
2154 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2155 if (dd >= 0) // If this exists in this sub-graph, great.
2156 tensor_blocks[dd].flags = 0;
2157 }
2158 }
2159 }
2160 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2161 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2162 {
2163 // Check no tensor info is auto now.
2164 assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2164, __extension__ __PRETTY_FUNCTION__
); }))
;
2165 // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as a parameter;
2166 // therefore, its life-cycle almost certainly won't concatenate properly with the tensor to
2167 // fold into).
2168 if (tensor_symbol_info[i].assign_ref)
2169 {
2170 // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2171 // It can be folded as input (it is fine to be overwritten), but it cannot be folded as output (when folded as input,
2172 // it keeps its own representation, which is not the case for output).
2173 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2174 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2175 // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2176 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2177 // It also cannot be folded as output (except i), because we need to keep its own representation.
2178 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2179 assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
== 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2179, __extension__ __PRETTY_FUNCTION__
); }))
;
2180 tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2181 for (j = 0; j < unroll_count; j++)
2182 {
2183 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT))
;
2184 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2185 }
2186 if (tensor_blocks[assign_ref].bypass_ref)
2187 {
2188 // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2189 tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2190 const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2191 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2192 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2193 // On the other hand, it can be folded into the except_ref for the bypass_ref.
2194 tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2195 if (dup_tensor_from_ref)
2196 {
2197 const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2198 if (bypass_from_ref >= 0)
2199 {
2200 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT))
;
2201 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT))
;
2202 assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
+ unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
- 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2202, __extension__ __PRETTY_FUNCTION__
); }))
;
2203 for (j = 0; j < unroll_count - 1; j++)
2204 {
2205 // Mark every incarnation as un-foldable.
2206 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_INPUT))
;
2207 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT))
;
2208 }
2209 }
2210 }
2211 }
2212 }
2213 }
2214 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2215 {
2216 // If it has a pair reference, we don't need to allocate this tensor at all,
2217 // set it to be unassigned.
2218 if (tensor_symbol_info[i].pair_ref)
2219 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED))
;
2220 // If it is a tape variable, set it to be un-foldable as well (otherwise we cannot use the tape properly).
2221 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2222 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2223 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2224 // For this case, there is no exception.
2225 tensor_blocks[i].unfoldable_except_ref = 0;
2226 } else if (tensor_symbol_info[i].p_ref) {
2227 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2227, __extension__ __PRETTY_FUNCTION__); }))
;
2228 const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2229 // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2230 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2231 // TODO: This check can be lifted if we can fold in the parent graph.
2232 if (-1 == p_ref_is_in_or_out)
2233 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2234 if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be folded as input.
2235 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2236 }
2237 }
2238 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2239 {
2240 if (tensor_symbol_info[i].alias_ref)
2241 {
2242 const int ref = tensor_symbol_info[i].alias_ref - 1;
2243 // If the referenced one is unassigned, mark it as assigned only if the current one is assigned.
2244 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2245 tensor_blocks[ref].flags = 0;
2246 // An alias cannot refer to another alias.
2247 assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
__assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2247, __extension__ __PRETTY_FUNCTION__); }))
;
2248 tensor_blocks[i].flags = ALIAS;
2249 tensor_blocks[i].ref = ref + 1; // Assign the ref.
2250 if (!tensor_blocks[ref].r_refs)
2251 tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2252 ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2253 }
2254 }
2255 // Scan again and if the ref is not assigned, mark the alias not assigned.
2256 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2257 if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2258 {
2259 const int ref = tensor_blocks[i].ref - 1;
2260 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2261 {
2262 // Mark this as unassigned.
2263 tensor_blocks[i].flags = UNASSIGNED;
2264 tensor_blocks[i].ref = 0;
2265 }
2266 }
2267 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2268 {
2269 // If this tensor is not expected to be unassigned, allocate the head and tail arrays.
2270 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
2271 {
2272 tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2273 tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2274 // Cache tensor size (align to 16 bytes).
2275 tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2276 }
2277 // If there is a p_ref, add the one to the p_refs list.
2278 if (tensor_symbol_info[i].p_ref)
2279 tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2280 }
2281 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2282 for (i = 0; i < node->input_size; i++)
2283 {
2284 int d = node->inputs[i];
2285 if (d < 0)
2286 continue;
2287 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2288 d = tensor_symbol_info[d].alias_ref - 1;
2289 tensor_blocks[d].flags |= READ_ONLY;
2290 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2291 continue;
2292 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2292, __extension__ __PRETTY_FUNCTION__
); }))
;
2293 /* If this is the first encounter, its head starts here (this tensor is init'ed outside of the graph,
2294 * so it lives from the very beginning of the graph life-cycle and ends here). */
2295 if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
2296 {
2297 for (j = 0; j < source_size; j++)
2298 {
2299 // If the source is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2300 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2301 if (cell.i32 && cell.i32[0] > 0)
2302 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2303 }
2304 /* If this is read-only (based on SSA, if first encountered as a read), and this is a
2305 * sub-graph (TODO: this condition can be lifted for a case..of that is never in a while
2306 * loop; however, in that case, you need to prevent the read-only block from being reused for the
2307 * output tensor, which is not obvious how to implement correctly), and it is not
2308 * assign_ref'd from anywhere (not a parameterized loop), then we cannot reuse this region
2309 * of memory anyway (because on the second loop, we want to read the same value out).
2310 * Mark it to the end of the graph. */
2311 if (p_node_info && !tensor_symbol_info[d].assign_ref)
2312 for (j = 0; j < destination_size; j++)
2313 {
2314 // If the destination is connected to the current node, add it (otherwise we would create tensor blocks that are used in other streams, which is unnecessary).
2315 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2316 if (cell.i32 && cell.i32[0] > 0)
2317 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2318 }
2319 }
2320 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2321 }
2322 for (i = 0; i < node->output_size; i++)
2323 {
2324 int d = node->outputs[i];
2325 if (d < 0)
2326 continue;
2327 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2328 d = tensor_symbol_info[d].alias_ref - 1;
2329 tensor_blocks[d].flags |= WRITE_ONLY;
2330 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2331 continue;
2332 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2332, __extension__ __PRETTY_FUNCTION__
); }))
;
2333 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2334 }
2335 } ccv_nnc_graph_visit_endfor} }
2336 // For any assign_ref, its life-time is kept until the end and wraps over.
2337 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2338 // If this tensor is not unassigned (or an alias) and it is assigned from somewhere else,
2339 // that "somewhere else" needs to keep its life-time till the end.
2340 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&&
2341 p_node_info && tensor_symbol_info[i].assign_ref)
2342 {
2343 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2344 for (j = 0; j < destination_size; j++)
2345 {
2346 // This logic is to be more conservative about which destinations we add to.
2347 // As of now, if we add everything, it is most likely fine. However, it may
2348 // cause issues in the future to do so naively. Thus, instead, we only add
2349 // the destination to it iff either the tensor is not used at all, or the
2350 // destination is on the same stream as the tensor block in some way.
2351 int flag = !tensor_blocks[assign_ref].tail;
2352 for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2353 {
2354 const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
+ (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)))
;
2355 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2356 flag = (cell.i32 && cell.i32[0] > 0);
2357 }
2358 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2359 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2360 }
2361 }
2362 for (i = 0; i < output_size; i++)
2363 {
2364 assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
__assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2364, __extension__ __PRETTY_FUNCTION__); }))
;
2365 int d = outputs[i].d;
2366 if (d < 0)
2367 continue;
2368 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2369 d = tensor_symbol_info[d].alias_ref - 1;
2370 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2371 continue;
2372 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2372, __extension__ __PRETTY_FUNCTION__
); }))
;
2373 for (j = 0; j < destination_size; j++)
2374 {
2375 int flag = !tensor_blocks[d].tail;
2376 for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2377 {
2378 const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
2379 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2380 flag = (cell.i32 && cell.i32[0] > 0);
2381 }
2382 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2383 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2384 }
2385 }
2386 // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2387 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2388 int x, y;
2389 for (x = 0; x < node->input_size; x++)
2390 for (y = 0; y < node->output_size; y++)
2391 /* Some operations enforce some tensors to be the same for inputs / outputs. */
2392 if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2393 {
2394 // If both unassigned, it is fine.
2395 if (node->inputs[x] < 0 && node->outputs[y] < 0)
2396 continue;
2397 int ref = node->inputs[x];
2398 assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2398, __extension__ __PRETTY_FUNCTION__); }))
;
2399 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2400 ref = tensor_blocks[ref].ref - 1;
2401 const int node_output_y = node->outputs[y];
2402 assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2402, __extension__ __PRETTY_FUNCTION__
); }))
;
2403 // If both are not computable, it is fine, we don't need to enforce.
2404 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2405 !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
!((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
))
)
2406 continue;
2407 // Otherwise, enforce and error out if failed.
2408 if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2409 { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2409, __extension__ __PRETTY_FUNCTION__
); }))
; }
2410 }
2411 } ccv_nnc_graph_visit_endfor} }
2412 // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2413 // we need to make sure enforced tensors are properly assigned, so that we don't bind to a tensor
2414 // that is not enforced in-place (because the tensor enforced in-place will be different from the
2415 // bound one).
2416 for (i = 0; i < tensor_bind_size; i++)
2417 {
2418 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2419 // If a tensor is bound here, then it is unassigned.
2420 if (resolved_symbol.d >= 0)
2421 {
2422 int d = resolved_symbol.d;
2423 // I cannot assert too much at this moment.
2424 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2425 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2426 // This check is for in-place ops. Only an in-place op could be unassigned but have a ref.
2427 // It has nothing to do with alias.
2428 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2429 d = tensor_blocks[d].ref - 1;
2430 // Doesn't work if this is a loop carrying variable.
2431 assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
__extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
__assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2431, __extension__ __PRETTY_FUNCTION__); }))
;
2432 tensor_blocks[d].flags = UNASSIGNED;
2433 tensor_blocks[d].ref = 0; // No need to have ref as well.
2434 }
2435 }
2436 // Maximize tensor reuse by collapsing tensors where in-place operations are allowed (and the start / end tensors match).
2437 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2438 int x, y;
2439 for (x = 0; x < node->input_size; x++)
2440 {
2441 /* If the input is not assigned, it may be a reference; find the referenced one. */
2442 int ref = node->inputs[x];
2443 if (ref < 0)
2444 continue;
2445 const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2446 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2447 ref = tensor_blocks[ref].ref - 1;
2448 assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2448, __extension__ __PRETTY_FUNCTION__
); }))
;
2449 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2450 tensor_blocks[ref].tail->rnum == 1)
2451 {
2452 for (y = 0; y < node->output_size; y++)
2453 /* Only proceed if the input symbol is different from the output symbol, */
2454 /* and the input symbol meets the output symbol exactly at the same spot. */
2455 if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2456 node->outputs[y] >= 0 &&
2457 ref != node->outputs[y] &&
2458 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
0x3) == UNASSIGNED))
)
2459 {
2460 const int node_output_y = node->outputs[y];
2461 const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2462 /* If the dimensions match perfectly, then we can assign y_symbol to x.
2463 * If both of them are aliases, make sure their origins match in size too. */
2464 if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2465 {
2466 _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2467 // This refers to an alias itself; mark it now and it will be processed later.
2468 if (ref != node->inputs[x])
2469 tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2470 }
2471 }
2472 }
2473 }
2474 } ccv_nnc_graph_visit_endfor} }
2475 // Specifically handle the bypass. This needs to be done after the first pass.
2476 // I need to extend the bypass' life-time to be the same as the one I am going with.
2477 // It is important that we visit these nodes and assign bypass_ref to their dependents in topological order.
2478 ccv_nnc_tensor_block_t empty_block = {};
2479 empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2480 empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2481 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2482 if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2483 {
2484 int can_bypass = 1;
2485 for (i = 0; can_bypass && i < node->output_size; i++)
2486 {
2487 int d = node->outputs[i];
2488 if (d < 0)
2489 continue;
2490 if (!tensor_blocks[d].bypass_ref)
2491 continue;
2492 while (tensor_blocks[d].ref)
2493 d = tensor_blocks[d].ref - 1;
2494 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2495 while (tensor_blocks[bypass_ref].ref)
2496 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2497 // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2498 if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2499 continue;
2500 ccv_array_clear(empty_block.head);
2501 for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2502 ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
+ (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j)))
);
2503 ccv_array_clear(empty_block.tail);
2504 for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2505 ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
+ (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j)))
);
2506 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2507 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, empty_block);
2508 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2509 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, empty_block);
2510 // It can only be unfoldable due to the while constraint. Check whether this satisfies the while loop constraint.
2511 assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2511, __extension__ __PRETTY_FUNCTION__
); }))
;
2512 int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2513 while (tensor_blocks[b_ref].ref)
2514 b_ref = tensor_blocks[b_ref].ref - 1;
2515 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2516 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2517 // These two can be assigned to the same region of memory without issue (because their life-times don't interfere)
2518 // even after we extend the life-time of bypass_ref. Then we are in good shape.
2519 can_bypass = can_bypass && (a_hop_b || b_hop_a);
2520 }
2521 if (can_bypass)
2522 {
2523 for (i = 0; i < node->output_size; i++)
2524 {
2525 int d = node->outputs[i];
2526 if (d < 0)
2527 continue;
2528 if (!tensor_blocks[d].bypass_ref)
2529 continue;
2530 while (tensor_blocks[d].ref)
2531 d = tensor_blocks[d].ref - 1;
2532 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2533 while (tensor_blocks[bypass_ref].ref)
2534 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2535 // The bypass_ref can extend its life-time.
2536 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2537 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2538 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2539 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2540 }
2541 } else {
2542 for (i = 0; i < node->output_size; i++)
2543 tensor_blocks[node->outputs[i]].bypass_ref = 0;
2544 const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2545 // Mark this exec as no-bypass IO (thus, I need to insert an explicit data transfer).
2546 exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2547 }
2548 }
2549 } ccv_nnc_graph_visit_endfor} }
2550 ccv_array_free(empty_block.head);
2551 ccv_array_free(empty_block.tail);
2552 *r_exec_dep = exec_dep;
2553 *r_tensor_blocks = tensor_blocks;
2554}
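// A minimal, self-contained sketch of the interference test used above: two
// tensor blocks may be assigned to the same region of memory only when one's
// first use (head) comes after the other's last use (tail) in execution order,
// i.e. a_hop_b || b_hop_a. The struct and helper below are hypothetical
// illustrations, not part of this file's data structures.
typedef struct { int head; int tail; } life_time_sketch_t;
static int _sketch_life_times_can_share(const life_time_sketch_t a, const life_time_sketch_t b)
{
	const int a_hop_b = b.head > a.tail; // b starts strictly after a ends
	const int b_hop_a = a.head > b.tail; // a starts strictly after b ends
	return a_hop_b || b_hop_a; // no overlap, safe to fold into one allocation
}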
2555
2556static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2557{
2558 if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2559 {
2560 ccv_nnc_cmd_t retval = cmd;
2561 retval.cmd = CCV_NNC_NOOP;
2562 return retval;
2563 }
2564 return cmd;
2565}
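// _ccv_nnc_subst_sub_graph_with_noop is a substitution callback for
// ccv_nnc_symbolic_graph_dup (used further below). As a hedged illustration of
// the callback shape only, a hypothetical pass-through substitution would be:
static ccv_nnc_cmd_t _sketch_identity_subst(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
{
	(void)symbol; // the exec symbol is available for per-node decisions
	return cmd; // keep every command unchanged (the function above swaps sub-graphs for NOOP instead)
}
// Usage sketch: ccv_nnc_symbolic_graph_t* copy = ccv_nnc_symbolic_graph_dup(graph, _sketch_identity_subst);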
2566
2567static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2568{
2569 if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2570 {
2571 if (tensor_symbol_info[input].alias_ref)
2572 {
2573 const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2574 assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2574, __extension__ __PRETTY_FUNCTION__
); }))
;
2575 ccv_nnc_tensor_symbol_t tensor_symbol = {};
2576 if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2577 {
2578 tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2579 if (tensor_symbol_info[alias_ref].pair_ref)
2580 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2581 .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2582 .graph = dup_graph->pair
2583 });
2584 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2585 dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2586 } else {
2587 tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2588 tensor_symbol.graph = dup_graph;
2589 }
2590 ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2591 if (tensor_symbol_info[input].pair_ref)
2592 ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2593 .d = tensor_symbol_info[input].pair_ref - 1,
2594 .graph = dup_graph->pair
2595 });
2596 ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2597 dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2598 } else {
2599 ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2600 if (tensor_symbol_info[input].pair_ref)
2601 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2602 .d = tensor_symbol_info[input].pair_ref - 1,
2603 .graph = dup_graph->pair
2604 });
2605 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2606 dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2607 }
2608 if (tensor_symbol_info[input].bypass_ref)
2609 {
2610 const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2611 assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2611, __extension__ __PRETTY_FUNCTION__
); }))
;
2612 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])))
;
2613 symbol_info->bypass_ref = dup_bypass_ref + 1;
2614 }
2615 }
2616 return (ccv_nnc_tensor_symbol_t) {
2617 .d = dup_tensor_block_ref[input * unroll_count],
2618 .graph = dup_graph,
2619 };
2620}
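// Note on the reference convention used by _ccv_nnc_dup_tensor_symbol and the
// surrounding code: alias_ref, assign_ref, bypass_ref and friends are stored
// 1-based so that 0 can mean "no reference", hence the recurring "- 1" when
// dereferencing and "+ 1" when storing. A hypothetical pair of helpers, shown
// only to make the convention explicit:
static inline int _sketch_ref_encode(const int index) { return index + 1; } // index 0 is stored as 1, and so on
static inline int _sketch_ref_decode(const int ref) { return ref - 1; } // only meaningful when ref > 0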
2621
2622static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2623{
2624 int i;
2625 if (dup_exec_ref[idx * unroll_count] < 0)
2626 {
2627 // Input has to come before output, because an output could have a bypass reference to the input.
2628 for (i = 0; i < node->input_size; i++)
2629 max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2630 for (i = 0; i < node->output_size; i++)
2631 max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2632 ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2633 dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2634 }
2635 return (ccv_nnc_graph_exec_symbol_t) {
2636 .d = dup_exec_ref[idx * unroll_count],
2637 .graph = dup_graph,
2638 };
2639}
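// _ccv_nnc_dup_graph_exec_symbol (and the tensor variant above) follows a
// memoization pattern: a ref table initialized to -1, filled on first request,
// and reused afterwards so every symbol is duplicated exactly once. A minimal,
// hypothetical sketch of the same pattern:
static int _sketch_memoized_dup(int* const ref_table, const int idx, int (*make_copy)(const int))
{
	if (ref_table[idx] < 0) // not duplicated yet
		ref_table[idx] = make_copy(idx); // create the copy and remember its index
	return ref_table[idx]; // later calls return the same copy
}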
2640
2641static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2642{
2643 int i;
2644 for (i = 0; i < tensor_block_size; i++)
2645 {
2646 if (tensor_blocks[i].head)
2647 ccv_array_free(tensor_blocks[i].head);
2648 if (tensor_blocks[i].tail)
2649 ccv_array_free(tensor_blocks[i].tail);
2650 if (tensor_blocks[i].r_refs)
2651 ccv_array_free(tensor_blocks[i].r_refs);
2652 if (tensor_blocks[i].dup_p_refs)
2653 ccv_array_free(tensor_blocks[i].dup_p_refs);
2654 }
2655 ccfreefree(tensor_blocks);
2656}
2657
2658// Find tensors that cannot be solved by co-allocating to the same location.
2659static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2660{
2661 int i, j, unroll_count = 0;
2662 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2663 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2664 {
2665 // This is a parameter, thus, it has to be either an alias or used.
2666 assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
& 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2666, __extension__ __PRETTY_FUNCTION__
); }))
;
2667 const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2668 // The parameter it assigns to has to be either an alias or used.
2669 assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2669, __extension__ __PRETTY_FUNCTION__
); }))
;
2670 // If either of the two (assigner and assignee) is an alias, check to see if they are the same.
2671 // If it is the same, we are good, no need to extend.
2672 int a_ref = i;
2673 while (tensor_blocks[a_ref].ref)
2674 a_ref = tensor_blocks[a_ref].ref - 1;
2675 int b_ref = assign_ref;
2676 while (tensor_blocks[b_ref].ref)
2677 b_ref = tensor_blocks[b_ref].ref - 1;
2678 if (a_ref != b_ref)
2679 {
2680 // If any of b's heads is deterministically later than a's tail,
2681 // or any of b's tails is deterministically earlier than a's head, they don't interfere.
2682 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2683 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2684 // It cannot be that both i can hop to j and j can hop to i.
2685 assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
> 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2685, __extension__ __PRETTY_FUNCTION__
); }))
;
2686 // Can it be folded?
2687 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2688 if (a_hop_b || b_hop_a)
2689 {
2690 tensor_blocks[a_ref].companion_ref = b_ref + 1;
2691 tensor_blocks[b_ref].companion_ref = a_ref + 1;
2692 continue;
2693 }
2694 int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2695 for (j = 0; c_ref >= 0; j++)
2696 {
2697 while (tensor_blocks[c_ref].ref)
2698 c_ref = tensor_blocks[c_ref].ref - 1;
2699 c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2700 }
2701 unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
= (j + 1); (_a > _b) ? _a : _b; })
;
2702 }
2703 }
2704 // Reset companion_ref if need to unroll.
2705 if (unroll_count)
2706 for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2707 tensor_blocks[j].companion_ref = 0;
2708 return unroll_count;
2709}
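// The unroll count above is effectively the length of the assign_ref chain:
// every extra hop in the loop-carried chain needs one more unrolled copy before
// the allocation can wrap around. A self-contained, hypothetical sketch with a
// 0-based next-index array (-1 meaning "no further assignment"):
static int _sketch_assign_chain_length(const int* const next, const int start, const int n)
{
	int count = 0;
	int d = next[start];
	while (d >= 0 && count < n) // n bounds the walk defensively
	{
		++count;
		d = next[d];
	}
	return count;
}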
2710
2711static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2712{
2713 int i, j, n;
2714 // The inout exec nodes; these are the nodes we are going to extend.
2715 uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2716 int max_input_size = 0;
2717 int max_output_size = 0;
2718 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2719 {
2720 max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
;
2721 max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
;
2722 }
2723 ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
2724 ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
2725 // Doing graph expansion
2726 // It goes without saying that we must have more than one tensor / exec (otherwise I cannot use 0 as no exec ref).
2727 assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2727, __extension__ __PRETTY_FUNCTION__
); }))
;
2728 assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2728, __extension__ __PRETTY_FUNCTION__
); }))
;
2729#define INCOMING_NODE (1)
2730#define OUTGOING_NODE (2)
2731 // Unroll the graph n times.
2732 for (n = 0; n < unroll_count; n++)
2733 {
2734 int* const dup_exec_ref = r_dup_exec_ref + n;
2735 const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2736 int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2737 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2738 dup_exec_ref[i * unroll_count] = -1;
2739 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2740 {
2741 // If there is an assign_ref, that means I don't need to dup the tensor.
2742 if (tensor_symbol_info[i].assign_ref)
2743 {
2744 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2745 dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2746 } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&& TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2747 // If this is a read-only tensor block, no need to duplicate because the value never changes
2748 // (note we handled assign_ref first), therefore, no need to generate a duplicate.
2749 dup_tensor_block_ref[i * unroll_count] = i;
2750 else
2751 dup_tensor_block_ref[i * unroll_count] = -1;
2752 }
2753 // Go through the original graph, make copies of the node if it is inout.
2754 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2755 ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2756 inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2757 if (!node->outgoings)
2758 continue;
2759 for (i = 0; i < node->outgoings->rnum; i++)
2760 {
2761 const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2762 inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2763 ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2764 ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2765 }
2766 } ccv_nnc_graph_visit_endfor} }
2767 // Check that the visited nodes are all marked as either incoming or outgoing.
2768 const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2769 const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2770 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2771 {
2772 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2773 continue;
2774 assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2774, __extension__ __PRETTY_FUNCTION__
); }))
;
2775 // If this is a pure incoming node, then I need to concat this one with all original destination nodes
2776 if (inout[i] == INCOMING_NODE)
2777 for (j = 0; j < dup_destination_size; j++)
2778 {
2779 ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2780 .d = dup_destinations[j].d,
2781 .graph = dup_graph,
2782 }, (ccv_nnc_graph_exec_symbol_t) {
2783 .d = dup_exec_ref[i * unroll_count],
2784 .graph = dup_graph,
2785 });
2786 }
2787 }
2788 if (dup_graph->destinations)
2789 ccv_array_clear(dup_graph->destinations);
2790 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2791 {
2792 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2793 continue;
2794 const int d = dup_exec_ref[i * unroll_count];
2795 ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
+ (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)))
;
2796 // If this has no outgoing node, add to the destination.
2797 if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2798 ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2799 .graph = dup_graph,
2800 .d = d,
2801 });
2802 }
2803 }
2804#undef INCOMING_NODE
2805#undef OUTGOING_NODE
2806 ccfreefree(inout);
2807}
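// Layout note for the ref tables filled in above: dup_exec_ref and
// dup_tensor_block_ref are row-major, with one row per original symbol and one
// column per unroll, so copy n of symbol i lives at [i * unroll_count + n].
// A hypothetical accessor that spells out the indexing:
static inline int _sketch_dup_ref_at(const int* const dup_ref, const int unroll_count, const int i, const int n)
{
	return dup_ref[i * unroll_count + n]; // -1 when symbol i has no copy for unroll n
}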
2808
2809static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2810{
2811 int i;
2812 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2813 // Now we can assign them (the dup) as companions.
2814 // Get to the last one, which we will wrap over.
2815 if (dup_tensor_symbol_info[i].assign_ref)
2816 {
2817 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2818 dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2819 assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2819, __extension__ __PRETTY_FUNCTION__
); }))
;
2820 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2821 }
2822}
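// The fix-up above makes a loop-carried value wrap around one full unrolled
// iteration by retargeting assign_ref at the last duplicate of its original
// target (and rebuilding r_assign_ref to match). A hypothetical, 0-based sketch
// of the retargeting step only:
static void _sketch_retarget_assign_to_last_copy(int* const assign_ref, const int i, const int* const dup_ref, const int unroll_count)
{
	const int old_target = assign_ref[i];
	if (old_target >= 0)
		assign_ref[i] = dup_ref[old_target * unroll_count + unroll_count - 1];
}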
2823
2824 // If the tensor blocks are the outputs of this graph, their life-times should be extended to the end of this graph.
2825 // However, it is not that simple if the graph is unrolled. For an unrolled graph, it needs to reach the end of
2826 // the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2827static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2828{
2829 int i, j, k;
2830 for (i = 0; i < p_node_info->output_size; i++)
2831 {
2832 const int d = p_node_info->outputs[i];
2833 const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx)))
- 1;
2834 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
!((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED))
)
2835 continue;
2836 for (k = 0; k < destination_size; k++)
2837 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2838 // Add the duplicated destinations to the tensor_block_ref.
2839 for (j = 0; j < unroll_count; j++)
2840 for (k = 0; k < destination_size; k++)
2841 {
2842 const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2843 const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2844 if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2845 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2846 }
2847 }
2848}
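// "Extending a life-time to the end of the graph" above boils down to adding
// every destination exec (and, for unrolled graphs, every duplicated
// destination) to the block's tail set via _ccv_nnc_tensor_block_add_exec.
// With plain arrays, a hypothetical equivalent would be:
static void _sketch_extend_tail_to_destinations(int* const tail, int* const tail_size, const int* const destinations, const int destination_size)
{
	int k;
	for (k = 0; k < destination_size; k++)
		tail[(*tail_size)++] = destinations[k]; // caller guarantees capacity and deduplication
}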
2849
2850static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2851{
2852 int i, j;
2853 ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2854 ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2855 // Find the blocks that cannot simply be solved with either in-place-operation tensor block folding or using the same memory region.
2856 // Unfortunately, I cannot do this analysis for the block folding done for sub-graphs, because we do sub-graph placement later.
2857 // No need to change anything, we are good.
2858 const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2859 if (!unroll_count)
2860 return;
2861 // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2862 // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2863 ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2864 int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2865 int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2866 _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2867 ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2868 ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2869 ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
(dup_graph->exec_symbol_info->rnum) + _incoming_edges_ >
1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
= (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = {
(dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1
; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for
(_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
_idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue
; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r =
1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) {
((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph
->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1
; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_
[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((
_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum
)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->destinations->rnum); _exist_size_[1] =
0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
= 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
6 && _d_ < (dup_graph->destinations->rnum))
{ _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
< ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 6 && _d_ <
(dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0
), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->
rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
<= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2869, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
2870 ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2871 _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2872 // Free out the old exec_dep
2873 ccv_matrix_free(exec_dep);
2874 // and the tensor blocks, prepare for the new.
2875 _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2876 // A reverse map to find which original tensor a duplicated tensor comes from.
2877 int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2878 for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2879 dup_tensor_from_ref[i] = -1;
2880 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2881 for (j = 0; j < unroll_count; j++)
2882 if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2883 dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2884 int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2885 for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2886 dup_exec_from_ref[i] = -1;
2887 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2888 {
2889 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2890 continue;
2891 dup_exec_from_ref[i] = i; // Reference back.
2892 for (j = 0; j < unroll_count; j++)
2893 if (dup_exec_ref[i * unroll_count + j] >= 0)
2894 dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2895 }
2896 // Reset all attr.
2897 memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2898 _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2899 ccv_nnc_graph_visit_free(dup_visit);
2900 ccfreefree(dup_exec_symbol_info);
2901 ccfreefree(dup_exec_from_ref);
2902 ccfreefree(dup_tensor_from_ref);
2903 // Assign out dup_p_ref, which will be used to extend the anonymous block life-time.
2904 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2905 // Loop over all possible duplications to assign dup_p_ref properly.
2906 for (j = 0; j < unroll_count; j++)
2907 {
2908 const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2909 if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2910 {
2911 const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2912 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2913 if (p_ref_0_is_in_or_out == 1) // If it is an output tensor, mark dup_p_ref for this.
2914 {
2915 if (!tensor_blocks[dup_idx].dup_p_refs)
2916 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2917 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2918 }
2919 if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2920 continue;
2921 const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2922 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2923 if (p_ref_1_is_in_or_out == 1)
2924 {
2925 if (!tensor_blocks[dup_idx].dup_p_refs)
2926 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2927 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2928 }
2929 }
2930 }
2931 // companion_ref
2932 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2933 // Now we can assign them (the dup) as companions.
2934 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2935 {
2936 // Get to the last one, which we will wrap over.
2937 const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2938 if (assign_ref >= 0)
2939 {
2940 int b_ref = assign_ref;
2941 while (tensor_blocks[b_ref].ref)
2942 b_ref = tensor_blocks[b_ref].ref - 1;
2943 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2944 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2945 // It cannot be that both i can hop to j and j can hop to i.
2946 // And after duplication, it is now possible to hop from one to the other.
2947 assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2947, __extension__ __PRETTY_FUNCTION__); }))
;
2948 tensor_blocks[i].companion_ref = b_ref + 1;
2949 tensor_blocks[b_ref].companion_ref = i + 1;
2950 }
2951 }
2952 ccfreefree(dup_tensor_symbol_info);
2953 // Extend the dup tensor block ref, prepare for future extensions.
2954 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2955 for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2956 dup_tensor_block_ref[i] = -1;
2957 // Assign out changed properties.
2958 *r_exec_dep = exec_dep;
2959 *r_tensor_blocks = tensor_blocks;
2960 *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2961 *r_dup_graph = dup_graph;
2962 *r_unroll_count = unroll_count;
2963 *r_dup_exec_ref = dup_exec_ref;
2964 *r_dup_tensor_block_ref = dup_tensor_block_ref;
2965}
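// The dup_tensor_from_ref / dup_exec_from_ref tables built above are plain
// reverse maps: initialize to -1, then invert the forward table so each
// duplicate can report which original it came from. A self-contained,
// hypothetical sketch of the same inversion:
static void _sketch_build_reverse_map(int* const from_ref, const int dup_size, const int* const dup_ref, const int orig_size, const int unroll_count)
{
	int i, j;
	for (i = 0; i < dup_size; i++)
		from_ref[i] = -1;
	for (i = 0; i < orig_size; i++)
		for (j = 0; j < unroll_count; j++)
			if (dup_ref[i * unroll_count + j] >= 0)
				from_ref[dup_ref[i * unroll_count + j]] = i;
}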
2966
2967static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2968{
2969 if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2970 return tensor_block_size;
2971 int i;
2972 const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
2973 int found_idx = tensor_block_size;
2974 for (i = 0; i < anonymous_block_free_list_cap; i++)
2975 {
2976 const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)))
;
2977 assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 2977, __extension__ __PRETTY_FUNCTION__
); }))
;
2978 // If the type doesn't match, ignore.
2979 if (tensor_blocks[idx].type != type)
2980 continue;
2981 // Heuristic about how to select the best tensor block to move forward.
2982 // If the size is larger, and no dup_p_refs are found, I cannot do better than this, just return directly.
2983 if (tensor_blocks[idx].size >= size)
2984 {
2985 if (no_dup_p_refs)
2986 return idx;
2987 // Otherwise, we cannot do better than this only if the current tensor block's dup_p_refs is after (or at) the requested dup_p_refs;
2988 // if that is the case, just return.
2989 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
2990 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
2991 return idx;
2992 }
2993 int64_t found_idx_size_diff;
2994 int64_t idx_size_diff;
2995 if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
2996 // Now, compare whether this one or the found_idx one is better.
2997 // At this point, there is no point of comparing the dup_p_refs, we only care about which one
2998 // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
2999 (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3000 {
3001 found_idx = idx;
3002 continue;
3003 }
3004 // No need to update if found_idx is better than idx.
3005 if (found_idx_size_diff > idx_size_diff)
3006 continue;
3007 // We bias towards the bigger one in case of a tie.
3008 if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3009 {
3010 found_idx = idx;
3011 continue;
3012 }
3013 assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
== tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 3013, __extension__ __PRETTY_FUNCTION__
); }))
;
3014 // On a tie, check which one has tighter life-cycle.
3015 if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3016 {
3017 // Check whether the current tensor block's life-cycle is longer than the previous one.
3018 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3019 (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3020 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3021 found_idx = idx;
3022 continue;
3023 }
3024 // Now both sizes are smaller than the requested size; in this case, we need to increase the tensor block size.
3025 // We prefer to choose the one that has a life-cycle closer to the expected one.
3026 if (no_dup_p_refs)
3027 {
3028 // Whoever is shorter wins.
3029 if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3030 (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3031 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3032 found_idx = idx;
3033 continue;
3034 }
3035 if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3036 continue;
3037 if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3038 {
3039 found_idx = idx;
3040 continue;
3041 }
3042 // If both cover the requested dup_p_refs, we prefer the shorter one; otherwise we prefer the longer one.
3043 const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3044 const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3045 if (idx_after_request && found_idx_after_request)
3046 {
3047 if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3048 found_idx = idx;
3049 continue;
3050 } else {
3051 // If we entered this branch, either idx_after_request is false or found_idx_after_request is false, or both.
3052 // If found_idx_after_request is true, we are currently doing fine, no need to proceed.
3053 // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3054 if (!found_idx_after_request && (idx_after_request ||
3055 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3056 found_idx = idx;
3057 continue;
3058 }
3059 }
3060 return found_idx;
3061}
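// Stripped of the dup_p_refs tie-breaking, the free-list search above is a
// best-fit scan: prefer the block whose size is closest to the request, biased
// toward the larger block on a tie. A hypothetical, self-contained reduction
// (requires <stdint.h> and <stdlib.h> for the fixed-width types and llabs):
static int _sketch_best_fit_by_size(const uint64_t* const sizes, const int count, const uint64_t request)
{
	int i, found = -1;
	for (i = 0; i < count; i++)
	{
		if (found < 0) { found = i; continue; }
		const int64_t idx_diff = llabs((int64_t)sizes[i] - (int64_t)request);
		const int64_t found_diff = llabs((int64_t)sizes[found] - (int64_t)request);
		if (idx_diff < found_diff || (idx_diff == found_diff && sizes[i] > sizes[found]))
			found = i;
	}
	return found; // -1 when the list is empty
}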
3062
3063static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3064{
3065 if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3066 return 0;
3067 int i, j, k;
3068 int input_size = 0;
3069 for (i = 0; i < p_node_info->p_while.input_size; i++)
3070 if (p_node_info->p_while.inputs[i] >= 0)
3071 ++input_size;
3072 // If it doesn't have tensor inputs (thus, only special inputs), just return.
3073 if (!input_size)
3074 return 0;
3075 ccv_nnc_tensor_symbol_t inputs[input_size];
3076 input_size = 0;
3077 for (i = 0; i < p_node_info->p_while.input_size; i++)
3078 if (p_node_info->p_while.inputs[i] >= 0)
3079 inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3080 .d = p_node_info->p_while.inputs[i],
3081 .graph = symbolic_graph,
3082 };
3083 assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
> 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3083, __extension__ __PRETTY_FUNCTION__
); }))
;
3084 ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
3085 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3086 for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3087 {
3088 // Make a noop copy of the breakpoint, but with some tensor inputs.
3089 ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
3090 ccv_array_push(dup_breakpoints, &noop);
3091 // Connect this noop to the outgoing nodes of breakpoints.
3092 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(symbolic_graph->breakpoints[i].d)))
;
3093 if (symbol_info->outgoings)
3094 for (j = 0; j < symbol_info->outgoings->rnum; j++)
3095 {
3096 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3097 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3098 .d = d,
3099 .graph = symbolic_graph,
3100 });
3101 }
3102 }
3103 for (i = 0; i < exec_symbol_info_size; i++)
3104 {
3105 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(i)))
;
3106 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
3107 continue;
3108 if (symbol_info->outgoings)
3109 {
3110 const int outgoing_size = symbol_info->outgoings->rnum;
3111 for (j = 0; j < outgoing_size; j++)
3112 {
3113 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3114 for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3115 if (d == symbolic_graph->breakpoints[k].d)
3116 {
3117 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(k)))
;
3118 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3119 .d = i,
3120 .graph = symbolic_graph,
3121 }, noop);
3122 // Found, connected, exit.
3123 break;
3124 }
3125 }
3126 }
3127 }
3128 // Add the dup_breakpoints to the sources if necessary.
3129 assert(symbolic_graph->sources);
3130 const int source_size = symbolic_graph->sources->rnum;
3131 for (i = 0; i < source_size; i++)
3132 {
3133 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i))->d;
3134 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3135 if (d == symbolic_graph->breakpoints[j].d)
3136 {
3137 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3138 ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3139 // Found, made, exit.
3140 break;
3141 }
3142 }
3143 // Add the dup_breakpoints to destination if necessary.
3144 assert(symbolic_graph->destinations);
3145 const int destination_size = symbolic_graph->destinations->rnum;
3146 for (i = 0; i < destination_size; i++)
3147 {
3148 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i))->d;
3149 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3150 if (d == symbolic_graph->breakpoints[j].d)
3151 {
3152 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j);
3153 ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3154 // Found, made, exit.
3155 break;
3156 }
3157 }
3158 return dup_breakpoints;
3159}
3160
3161// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3162static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3163{
3164 assert(source_size > 0);
3165 assert(destination_size > 0);
3166 // First, fill all the "auto" holes.
3167 // This is the symbol table that with "auto" info filled up.
3168 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3169 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3170 ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3171 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
3172 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3173 int i, j, k, p, q;
3174 const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3175 ccv_sparse_matrix_t* exec_dep;
3176 ccv_nnc_tensor_block_t* tensor_blocks;
3177 _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3178 int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3179 // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3180 // are automatically filled in, and all the sub-graphs are processed.
3181 // There is a last step though, for a while loop, it is parameterized:
3182 // while (x > 5) {
3183 // y = x + 1;
3184 // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3185 // we will do our best to avoid the actual data copy; what we do here is to check whether y can be x's alias.
3186 // If y can be x's alias, this is good, no other changes required. In the above case, y can be x's alias because
3187 // it is an in-place operation.
3188 // But if y cannot be x's alias, for example, this while loop looks like this:
3189 // while (x > 5) {
3190 // y = x + a
3191 // b = x + y
3192 // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3193 // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3194 // has dependency on y as well).
3195 // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3196 // y = x + a -> b = x + y
3197 // This graph will be extended to look like this:
3198 // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3199 // while (x0 > 5) {
3200 // y0 = x0 + a0
3201 // b0 = x0 + y0
3202 // if (y0 > 5) break
3203 // y1 = y0 + b0
3204 // b1 = y0 + y1
3205 // } (y1 => x0, b1 => a0)
3206 // After this expansion, y1 can now be the alias of x0, and b1 can be the alias of a0 (they don't interfere
3207 // with each other now).
3208 // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers,
3209 // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
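// To make the pointer switch concrete (a reading of the example above; y0/y1 and b0/b1 are the unrolled
// copies, not identifiers in this file): on one pass the multiview presents y as y0 and x as y1, on the
// next pass the roles swap, so the (y => x) / (b => a) carry-over at the loop back-edge needs no copy.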
3210 ccv_nnc_symbolic_graph_t* dup_graph = 0;
3211 int* dup_exec_ref = 0;
3212 int* dup_tensor_block_ref = 0;
3213 int unroll_count = 0;
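// Indexing convention, as used throughout below: dup_exec_ref[exec * unroll_count + k] and
// dup_tensor_block_ref[block * unroll_count + k] give the k-th unrolled duplicate of an exec node /
// tensor block; a negative value means there is no duplicate for that unroll iteration.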
3214 // In true recursive fashion, I need to call all the sub-graphs and do the pre-compilation for them one by one.
3215 ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3216 prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3217 prep->flags = 0;
3218 // Cannot handle dup a node that is a graph as well.
3219 if (p_exec_symbol_info)
3220 {
3221 prep->flags = p_node_info->flags;
3222 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3223 {
3224 _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3225 _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0), symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3226 } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3227 // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3228 }
3229 }
3230 ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3231 ccv_array_t* anonymous_block_free_list = 0;
3232 const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3233 // Record whether this tensor is folded in this round.
3234 uint32_t* const tensor_fold = (uint32_t*)ccmalloc(sizeof(uint32_t) * tensor_fold_size);
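// One bit per tensor block, 32 blocks per uint32_t word; a block's bit is set / tested below via
// tensor_fold[i >> 5] and (1u << (i & 0x1f)).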
3235 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
3236 for (p = 0; p < node->graph_ref_size; p++)
3237 {
3238 assert(symbolic_graph->sub_graphs);
3239 ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1);
3240 ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3241 ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0), sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0), sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3242 sub_prep->dup_breakpoints = dup_breakpoints;
3243 sub_prep->p = prep;
3244 sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1] = sub_prep;
3245 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3246 const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3247 for (i = 0; i < s_alloc_prep->block_size; i++)
3248 {
3249 const int block_ref = s_alloc_prep->blocks[i].block_ref;
3250 const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3251 if (block_ref < sub_prep->tensor_symbol_info_size)
3252 {
3253 // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3254 // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3255 if (s_tensor_blocks[block_ref].bypass_ref)
3256 {
3257 int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3258 while (s_tensor_blocks[bypass_ref].ref)
3259 bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3260 if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3261 s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3262 continue;
3263 }
3264 if (s_tensor_blocks[block_ref].p_refs[0])
3265 {
3266 /* If it is already properly assigned, next. */
3267 if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3268 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3269 {
3270 if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3271 s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3272 else {
3273 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3274 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3275 }
3276 }
3277 /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3278 if (s_tensor_blocks[block_ref].p_refs[1] &&
3279 s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3280 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3281 {
3282 assert(s_alloc_prep->buffers[buffer_ref].p_refs[0]);
3283 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1]);
3284 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3285 }
3286 }
3287 } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3288 /* In this case, the only relevant bit is dup_p_ref. dup_p_ref extends the life-time of an anonymous block
3289 * which by default only has its life-cycle shared with this sub-graph node. The reason to extend is that
3290 * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3291 * always points to an output tensor of this sub-graph node); therefore, the memory region must extend
3292 * its life-time to the end of the output tensor. */
3293 if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3294 s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3295 for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3296 ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j));
3297 }
3298 }
3299 }
3300 const int init_tensor_block_size = tensor_block_size;
3301 int rw_anonymous_buffer_size_cap = 0;
3302 int ro_anonymous_buffer_size_cap = 0;
3303 if (anonymous_block_free_list)
3304 ccv_array_clear(anonymous_block_free_list);
3305 memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3306 for (p = 0; p < node->graph_ref_size; p++)
3307 {
3308 ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)[p] - 1];
3309 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3310 int rw_anonymous_buffer_size = 0;
3311 int ro_anonymous_buffer_size = 0;
3312 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3313 if (s_alloc_prep->buffers[i].p_refs[0])
3314 {
3315 /* Reduce 2 p_refs, if there are 2, to 1 p_ref (by doing block folding). */
3316 int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3317 /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3318 int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
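// The helper returns 1 when p_ref_0 is an output of this node and -1 when it is an input (see the swap
// below that forces p_ref_0 to be the output); 0 would mean neither, which the assert rules out.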
3319 assert(p_ref_0_is_in_or_out != 0);
3320 int unref_p_ref_0 = p_ref_0;
3321 while (tensor_blocks[unref_p_ref_0].ref)
3322 unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3323 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3324 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3325 if (s_alloc_prep->buffers[i].p_refs[1])
3326 {
3327 int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3328 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3329 assert(p_ref_1_is_in_or_out != 0);
3330 int unref_p_ref_1 = p_ref_1;
3331 while (tensor_blocks[unref_p_ref_1].ref)
3332 unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3333 /* See above comment for the similar p_ref_0 check. */
3334 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]));
3335 assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out);
3336 int p_ref_t;
3337 if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3338 {
3339 CCV_SWAP(p_ref_0, p_ref_1, p_ref_t);
3340 CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t);
3341 }
3342 p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3343 /* If the dimensions match, we can fold. TODO: should the dimensions match perfectly here? */
3344 if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
3345 {
3346 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3347 if (folded)
3348 {
3349 p_ref_0 = p_ref_1;
3350 unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3351 tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3352 for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3353 {
3354 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3355 assert(folded && "the subsequent duplicates can be folded too.");
3356 }
3357 }
3358 }
3359 }
3360 /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3361 * Or if the p_ref_0 is the output, it is first started from this node (thus, I have full control over
3362 * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3363 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3364 * within its memory region)). Or, if this buffer is used as read-only and we don't have any output
3365 * associated with it, we are good. */
3366 if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3367 (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3368 (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3369 TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3370 {
3371 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3372 { assert(s_alloc_prep->buffers[i].p_refs[1] == 0); }
3373 /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3374 * is a long argument why that is the case, the digest is, it is much easier to control your output
3375 * than your input). */
3376 s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3377 s_alloc_prep->buffers[i].p_refs[1] = 0;
3378 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3379 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]));
3380 tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size);
3381 for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3382 tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3383 tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3384 tensor_blocks[unref_p_ref_0].size;
3385 } else {
3386 s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3387 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3388 ++ro_anonymous_buffer_size;
3389 else
3390 rw_anonymous_buffer_size += unroll_count + 1;
3391 }
3392 } else {
3393 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY)
3394 ++ro_anonymous_buffer_size;
3395 else
3396 rw_anonymous_buffer_size += unroll_count + 1;
3397 }
3398 if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3399 {
3400 const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3401 // All read-write buffers can (potentially) be reused between case..of branches.
3402 rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3403 // Read-only buffers cannot be reused between case..of branches.
3404 ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3405 /* Anonymous block, allocate additional tensor blocks for this. */
3406 /* This is either because this is an internal tensor (don't have p_ref) */
3407 /* or it is an anonymous block itself within the sub graphs of this while graph. */
3408 tensor_blocks = (ccv_nnc_tensor_block_t*)ccrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3409 memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3410 if (dup_tensor_block_ref)
3411 dup_tensor_block_ref = (int*)ccrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3412 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3413 if (!s_alloc_prep->buffers[i].p_refs[0])
3414 {
3415 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i]) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3416 {
3417 assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap);
3418 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size]);
3419 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3420 tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3421 tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3422 tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3423 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3424 tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3425 ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3426 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3427 if (dup_p_refs && dup_p_refs->rnum > 0)
3428 {
3429 for (j = 0; j < dup_p_refs->rnum; j++)
3430 {
3431 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3432 assert(dup_p_ref >= 0);
3433 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3434 assert(tensor_blocks[dup_p_ref].tail);
3435 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3436 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3437 if (tensor_symbol_info[dup_p_ref].p_ref)
3438 {
3439 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3440 assert(p_node_info);
3441 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3442 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3443 {
3444 if (!tensor_blocks[tensor_block_size].dup_p_refs)
3445 tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3446 ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3447 }
3448 }
3449 if (!tensor_blocks[tensor_block_size].tail)
3450 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3451 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3452 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_size]);
3453 }
3454 } else {
3455 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3456 ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3457 }
3458 for (j = 0; j < source_size; j++)
3459 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3460 /* If this is read-only (based on SSA, i.e. first encountered as a read), and this is a
3461 * sub-graph, mark it to the end of the graph. */
3462 if (p_exec_symbol_info)
3463 for (j = 0; j < destination_size; j++)
3464 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3465 /* If it is read-only, it is self-reflecting. */
3466 for (k = 0; k < unroll_count; k++)
3467 {
3468 for (j = 0; j < destination_size; j++)
3469 if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3470 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3471 /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3472 assert(symbolic_graph->p);
3473 dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3474 }
3475 ++tensor_block_size;
3476 } else {
3477 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3478 const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3479 const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
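// Equality with tensor_block_size means the free list had nothing reusable and a fresh anonymous
// block is being created at the end of the array (presumably how the helper signals "not found").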
3480 // Find suitable tensor block from the free list.
3481 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3482 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3483 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3484 if (new_anonymous_tensor_block)
3485 {
3486 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3487 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3488 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3489 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3490 ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3491 } else {
3492 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3493 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3494 }
3495 if (dup_p_refs && dup_p_refs->rnum > 0)
3496 {
3497 for (j = 0; j < dup_p_refs->rnum; j++)
3498 {
3499 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3500 assert(dup_p_ref >= 0);
3501 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3502 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3503 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3504 if (tensor_symbol_info[dup_p_ref].p_ref)
3505 {
3506 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3507 assert(p_node_info);
3508 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3509 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3510 {
3511 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3512 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3513 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3514 }
3515 }
3516 assert(tensor_blocks[dup_p_ref].tail);
3517 if (!tensor_blocks[tensor_block_idx].tail)
3518 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3519 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3520 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k), tensor_blocks[tensor_block_idx]);
3521 // We have to add it to the wrap-around companion_ref as well.
3522 // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3523 // be occupied and is unlikely to be reused), we cannot really do much about it because the companion_ref's
3524 // definition is too free-form, and if we enforce a stronger guarantee on it (such as that it must wrap around), that
3525 // guarantee may break down later.
3526 if (tensor_blocks[dup_p_ref].companion_ref)
3527 {
3528 const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3529 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3530 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3531 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3532 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3533 }
3534 }
3535 } else if (new_anonymous_tensor_block) {
3536 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3537 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3538 }
3539 const int prev_tensor_block_idx = tensor_block_idx;
3540 if (new_anonymous_tensor_block)
3541 {
3542 if (!anonymous_block_free_list)
3543 anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3544 ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3545 ++tensor_block_size;
3546 }
3547 for (k = 0; k < unroll_count; k++)
3548 {
3549 const int tensor_block_idx = new_anonymous_tensor_block ?
3550 (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3551 dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3552 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx]);
3553 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]));
3554 if (new_anonymous_tensor_block)
3555 {
3556 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3557 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3558 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3559 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3560 /* Attach to duplicated exec for this tensor block. */
3561 ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3562 } else {
3563 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3564 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size);
3565 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3566
3567 }
3568 if (dup_p_refs && dup_p_refs->rnum > 0)
3569 {
3570 /* Not nil, not self-reflecting. */
3571 for (j = 0; j < dup_p_refs->rnum; j++)
3572 {
3573 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j);
3574 assert(dup_p_ref >= 0);
3575 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum);
3576 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3577 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3578 if (tensor_symbol_info[dup_p_ref].p_ref)
3579 {
3580 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3581 assert(p_node_info);
3582 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3583 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3584 {
3585 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3586 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3587 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3588 }
3589 }
3590 assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref);
3591 const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3592 assert(tensor_blocks[dup_dup_p_ref].tail);
3593 if (!tensor_blocks[tensor_block_idx].tail)
3594 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3595 for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3596 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q), tensor_blocks[tensor_block_idx]);
3597 // We have to add it to the wrap-around companion_ref as well.
3598 if (tensor_blocks[dup_dup_p_ref].companion_ref)
3599 {
3600 const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3601 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3602 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q), tensor_blocks[tensor_block_idx]);
3603 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3604 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q), tensor_blocks[tensor_block_idx]);
3605 }
3606 }
3607 } else if (new_anonymous_tensor_block) {
3608 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3609 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3610 }
3611 if (new_anonymous_tensor_block)
3612 ++tensor_block_size;
3613 }
3614 }
3615 }
3616 }
3617 }
3618 } ccv_nnc_graph_visit_endfor
3619 if (anonymous_block_free_list)
3620 ccv_array_free(anonymous_block_free_list);
3621 ccfree(tensor_fold);
3622 // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3623 // the allocation dependencies, thus, which tensor is reused to the existing tensor.
3624 ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3625 prep->while_count_tensor = 0;
3626 prep->dup_breakpoints = 0;
3627 prep->p = 0;
3628 prep->symbolic_graph = symbolic_graph;
3629 prep->p_idx = symbolic_graph->p_idx;
3630 prep->exec_idx = symbolic_graph->exec_idx;
3631 prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3632 prep->sub_preps = sub_preps;
3633 prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3634 prep->exec_symbol_info = exec_symbol_info;
3635 prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3636 prep->tensor_symbol_info = tensor_symbol_info;
3637 prep->unroll_count = unroll_count;
3638 prep->dup_tensor_block_ref = dup_tensor_block_ref;
3639 prep->tensor_block_size = tensor_block_size;
3640 prep->tensor_blocks = tensor_blocks;
3641 prep->exec_flags = exec_flags;
3642 prep->visit = visit;
3643 prep->alloc_prep = alloc_prep;
3644 if (dup_graph)
3645 ccv_nnc_symbolic_graph_free(dup_graph);
3646 if (dup_exec_ref)
3647 ccfreefree(dup_exec_ref);
3648 return prep;
3649}
3650
3651static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3652{
3653 int i;
3654 _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3655 ccfreefree(prep->exec_flags);
3656 for (i = 0; i < prep->sub_prep_size; i++)
3657 if (prep->sub_preps[i])
3658 _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
3659 if (prep->sub_preps)
3660 ccfreefree(prep->sub_preps);
3661 ccfreefree(prep->tensor_symbol_info);
3662 ccfreefree(prep->exec_symbol_info);
3663 if (prep->dup_tensor_block_ref)
3664 ccfreefree(prep->dup_tensor_block_ref);
3665 _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3666 ccv_nnc_graph_visit_free(prep->visit);
3667 ccfreefree(prep);
3668}
3669
3670static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3671{
3672 int i, j;
3673 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
3674 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3675 {
3676 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3677 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3677, __extension__ __PRETTY_FUNCTION__
); }))
;
3678 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3679 for (i = 0; i < node->p_while.input_size; i++)
3680 if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3681 {
3682 ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
3683 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3684 for (j = 0; j < d; j++)
3685 prep = prep->p;
3686 prep->while_count_tensor = 1;
3687 }
3688 }
3689 for (i = 0; i < node->graph_ref_size; i++)
3690 {
3691 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3692 if (graph_ref >= 0)
3693 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3694 }
3695 } ccv_nnc_graph_visit_endfor} }
3696}
3697
3698static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3699{
3700 if (symbol >= 0)
3701 return graph_prep->tensor_arena->vt_tensors[symbol];
3702 if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3703 return 0;
3704 assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
: 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3704, __extension__ __PRETTY_FUNCTION__
); }))
;
3705 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3706 int i;
3707 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3708 for (i = 0; i < d; i++)
3709 prep = prep->p;
3710 assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3710, __extension__ __PRETTY_FUNCTION__
); }))
;
3711 return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3712}
3713
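Note on the while-count symbol encoding used by the two functions above: the expanded macros show that a while-count tensor symbol is recognized by its low nibble being 0xe, and that the nesting depth d is recovered by bitwise-negating the value and shifting right by 4. A minimal standalone check of that arithmetic, using only the two expressions from the listing; the example constant below is hand-built to satisfy them and is not taken from the library's encoder:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint32_t symbol = 0xFFFFFFDEu; // == ~((2u << 4) | 1u), so depth 2 by construction
        assert((symbol & 0xf) == 0xe);       // the CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL test
        const uint32_t d = (~symbol) >> 4;   // the CCV_NNC_DECODE_WHILE_COUNT_SYMBOL expression
        assert(d == 2);                      // _ccv_nnc_tensor_from_graph_prep walks d levels up prep->p
        return 0;
    }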
3714static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3715{
3716 int i;
3717 int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3718 ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3719 graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3720 graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3721 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3722 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3723 if (graph_execs[i].graph == graph)
3724 graph_execs[i].d = exec_cvt[graph_execs[i].d];
3725 ccfreefree(exec_cvt);
3726}
3727
3728static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3729{
3730 int i, j, k;
3731 ccv_nnc_graph_t* const graph = graph_prep->graph;
3732 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3733 ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
  [1] Uninitialized value stored to field 'graph'
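The analyzer's point at step [1]: the arena at line 3733 comes from a single plain malloc (ccmalloc expands to malloc in this build, as the inline expansion shows) that also carries the trailing graph_execs array, so every graph_execs[i].graph is indeterminate until the loop at lines 3741-3749 stores 0 into it. A stripped-down sketch of the same allocation shape; the struct and names here are stand-ins, not the library's types:

    #include <stdlib.h>

    typedef struct { int d; void* graph; } exec_t;         /* stand-in for ccv_nnc_graph_exec_t */
    typedef struct { int size; exec_t execs[1]; } arena_t; /* stand-in for the arena layout */

    /* Same shape as line 3733: one malloc sized for the header plus (n - 1) extra entries,
     * assuming n >= 1. Nothing zeroes execs[], so each execs[i].graph starts out as garbage;
     * the caller is expected to initialize it, as lines 3741-3749 do. */
    arena_t* arena_new(const int n)
    {
        arena_t* const arena = (arena_t*)malloc(sizeof(arena_t) + sizeof(exec_t) * (n - 1));
        arena->size = n;
        return arena;
    }

    int main(void)
    {
        arena_t* const arena = arena_new(4);
        // Reading arena->execs[0].graph here, before any store, is exactly the kind of
        // indeterminate read the checker reports at line 3759.
        free(arena);
        return 0;
    }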
3734 graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3735 graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3736 graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3737 graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3738 memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3739 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3740 int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3741 for (i = 0; i < exec_symbol_info_size; i++)
  [2] Assuming 'i' is >= 'exec_symbol_info_size'
  [3] Loop condition is false. Execution continues on line 3750
3742 {
3743 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; })
;
3744 max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; })
;
3745 if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3746 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
: _b; })
;
3747 graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3748 graph_execs[i].graph = 0;
3749 }
3750 for (i = 0; i < graph_prep->sub_prep_size; i++)
  [4] Assuming 'i' is >= field 'sub_prep_size'
  [5] Loop condition is false. Execution continues on line 3752
3751 max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; })
;
3752 ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
  [6] '?' condition is true
3753 ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
  [7] '?' condition is true
3754 ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })
];
  [8] '?' condition is true
3755 const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3756 const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3757 // Create nodes; this is in topological order.
3758 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
  [9] Assuming '_i_' is < field 'size'
  [10] Loop condition is true. Entering loop body
3759 if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
  [11] The left operand of '==' is a garbage value
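Reading the path as a whole: step [1] notes that the freshly malloc'ed arena leaves graph_execs[i].graph indeterminate; steps [2]-[5] assume both exec_symbol_info_size and sub_prep_size are 0, so neither the initialization loop at line 3741 nor the breakpoint loop at line 3750 runs; steps [9]-[10] then assume the visit still has at least one node, and step [11] flags the read of graph_execs[idx].graph inside CCV_NO_GRAPH_EXEC as garbage. In practice a node index taken from the visit should always be below exec_symbol_info_size, in which case the loop at 3741 already initialized the entry; the analyzer simply cannot see that invariant. Two ways to make it visible, sketched against the names in scope in this function rather than as a committed fix:

    // (a) zero the trailing array right after it is set up at line 3739, so the .graph read is
    //     well-defined on every path (memset is already used one line earlier for sub_arenas;
    //     the loop at 3741 is still needed to set .d when exec_symbol_info_size > 0):
    memset(graph_execs, 0, sizeof(ccv_nnc_graph_exec_t) * exec_symbol_info_size);
    // (b) or state the invariant just before line 3759, which also documents it for readers:
    assert(idx >= 0 && idx < exec_symbol_info_size);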
3760 {
3761 for (i = 0; i < node->input_size; i++)
3762 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3763 for (i = 0; i < node->output_size; i++)
3764 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3765 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3766 {
3767 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3768 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3768, __extension__ __PRETTY_FUNCTION__
); }))
;
3769 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3770 ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3771 graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3772 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3773 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3774 ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3775 for (i = 0; i < node->p_while.input_size; i++)
3776 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3777 for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3778 max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3779 ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3780 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3781 } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3782 for (i = 0; i < node->output_size; i++)
3783 if (max_outputs[i] && max_outputs[i]->alias_ref)
3784 max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3785 graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3786 // Check whether this is already covered in the inputs; if not, it needs to be covered in the update.
3787 for (i = 0; i < node->case_of.argument.offset; i++)
3788 {
3789 ccv_nnc_tensor_t* const update = max_inputs[i];
3790 if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3791 continue;
3792 int flag = 0;
3793 for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3794 flag = (update == max_inputs[j]);
3795 if (!flag)
3796 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3797 }
3798 const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3799 ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3800 if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3801 {
3802 // Add another graph for data transfer.
3803 ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3804 for (i = 0; i < node->output_size; i++)
3805 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3806 ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
, max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
);
3807 ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3808 ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3809 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3810 int exec_cvt;
3811 ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3812 }
3813 for (i = 0; i < node->graph_ref_size; i++)
3814 {
3815 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3816 if (graph_ref < 0)
3817 continue;
3818 ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3819 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3820 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3821 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3822 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3823 }
3824 } else {
3825 graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3826 }
3827 ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3828 }
3829 } ccv_nnc_graph_visit_endfor} }
3830 // Then connect them.
3831 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
3832 if (node->outgoings)
3833 for (i = 0; i < node->outgoings->rnum; i++)
3834 {
3835 const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
3836 if (graph_execs[outgoing].graph)
3837 ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3838 }
3839 } ccv_nnc_graph_visit_endfor} }
3840 int source_exec_created = 0;
3841 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3842 const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3843 ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3844 // After the graph is materialized, we need to handle the case where some of these tensors need to be initialized to zeros or ones before use.
3845 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3846 {
3847 if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
3848 {
3849 int ref = i;
3850 while (tensor_symbol_info[ref].alias_ref)
3851 ref = tensor_symbol_info[ref].alias_ref - 1;
3852 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
3853 ref = tensor_blocks[ref].ref - 1;
3854 // This is not computable. It could be that we marked a const tensor as init zero.
3855 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
)
3856 continue;
3857 // If this tensor is not used by any exec, we don't need to init at all. Skip.
3858 if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3859 continue;
3860 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3861 // Now, we have the original tensor, we can get the actual tensor, and construct the set command.
3862 ccv_nnc_graph_exec_t set_exec;
3863 if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3864 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3865 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3866 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3867 for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3868 {
3869 const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)))
;
3870 if (outgoing >= exec_symbol_info_size)
3871 continue;
3872 assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3872, __extension__ __PRETTY_FUNCTION__
); }))
;
3873 assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3873, __extension__ __PRETTY_FUNCTION__
); }))
;
3874 ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3875 }
3876 int flags = 0;
3877 if (alloc_dep[ref])
3878 for (j = 0; j < alloc_dep[ref]->rnum; j++)
3879 {
3880 const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)))
;
3881 // This is from alloc_dep, it should be computable.
3882 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3882, __extension__ __PRETTY_FUNCTION__
); }))
;
3883 if (tensor_blocks[d].tail)
3884 for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3885 {
3886 const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
3887 if (incoming >= exec_symbol_info_size)
3888 continue;
3889 assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3889, __extension__ __PRETTY_FUNCTION__
); }))
;
3890 assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3890, __extension__ __PRETTY_FUNCTION__
); }))
;
3891 ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3892 flags = 1;
3893 }
3894 }
3895 // If we cannot find a start node for this exec, we need to append it to the no-op at the start.
3896 if (!flags)
3897 {
3898 if (!source_exec_created)
3899 {
3900 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3901 source_exec_created = 1;
3902 }
3903 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3904 }
3905 }
3906 }
3907 // Now go through the list of tensors to see whether we need to do an explicit broadcast for these tensor multi-views
3908 // (we need that if a multi-view is not associated as an input / output of any exec, which is possible if all execs
3909 // only associate with its alias).
3910 assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3910, __extension__ __PRETTY_FUNCTION__
); }))
;
3911 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3912 {
3913 ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3914 // If it is a multiview tensor, inspect all its heads to see whether we already associated it with the node.
3915 if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3916 {
3917 const ccv_array_t* const head = tensor_blocks[i].head;
3918 if (head && head->rnum > 0)
3919 for (j = 0; j < head->rnum; j++)
3920 {
3921 const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(j)))
;
3922 if (idx >= exec_symbol_info_size)
3923 continue;
3924 assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3924, __extension__ __PRETTY_FUNCTION__); }))
;
3925 const int d = graph_execs[idx].d;
3926 ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)))
;
3927 int flag = 0;
3928 if (exec_info->tensor_wraps_ref)
3929 {
3930 ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)))
;
3931 for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3932 flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3933 }
3934 // If none of them set the flag, it needs to be included in the cast.
3935 if (!flag)
3936 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3937 }
3938 }
3939 }
3940 // Create source / destination phony node. This is to facilitate use of compiled graph.
3941 // Also, this is needed if you have init zero execs.
3942 if (source_exec_created || source_size > 1)
3943 {
3944 if (!source_exec_created)
3945 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3946 for (i = 0; i < source_size; i++)
3947 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3948 } else {
3949 assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3949, __extension__ __PRETTY_FUNCTION__
); }))
;
3950 assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3950, __extension__ __PRETTY_FUNCTION__
); }))
;
3951 graph_exec_arena->source = graph_execs[sources[0].d];
3952 }
3953 if (destination_size == 1)
3954 graph_exec_arena->destination = graph_execs[destinations[0].d];
3955 else {
3956 graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3957 for (i = 0; i < destination_size; i++)
3958 ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3959 }
3960 ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3961 ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3962 return graph_exec_arena;
3963}
3964
3965static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3966{
3967 if (graph_prep->symbolic_graph == pair)
3968 return graph_prep->graph;
3969 int i;
3970 for (i = 0; i < graph_prep->sub_prep_size; i++)
3971 if (graph_prep->sub_preps[i])
3972 {
3973 ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
3974 if (graph)
3975 return graph;
3976 }
3977 return 0;
3978}
3979
3980static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3981{
3982 int i;
3983 for (i = 0; i < graph_prep->sub_prep_size; i++)
3984 if (graph_prep->sub_preps[i])
3985 {
3986 if (graph_prep->sub_preps[i]->symbolic_graph->pair)
3987 graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
3988 }
3989}
3990
3991static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3992{
3993 assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3993, __extension__ __PRETTY_FUNCTION__
); }))
;
3994 int i;
3995 for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
3996 {
3997 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
)
)
3998 continue;
3999 if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4000 {
4001 ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4002 .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4003 .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4004 });
4005 if (pair_exec.d >= 0)
4006 ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4007 }
4008 }
4009 for (i = 0; i < graph_prep->sub_prep_size; i++)
4010 if (graph_prep->sub_preps[i])
4011 _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4012}
4013
4014static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4015{
4016 int i;
4017 if (graph_prep->dup_breakpoints)
4018 {
4019 // Stripping the const modifier is only possible because it is a sub-graph.
4020 ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4021 for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4022 ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
+ (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i)))
);
4023 ccv_array_free(graph_prep->dup_breakpoints);
4024 graph_prep->dup_breakpoints = 0;
4025 graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4026 // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
4027 memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0)))
, sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4028 // Since exec_symbol_info changed, create a new visit object.
4029 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4029, __extension__ __PRETTY_FUNCTION__
); }))
;
4030 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4030, __extension__ __PRETTY_FUNCTION__); }))
;
4031 ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)))
;
4032 const int source_size = symbolic_graph->sources->rnum;
4033 ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
;
4034 const int destination_size = symbolic_graph->destinations->rnum;
4035 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4035, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4035, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4035, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4035, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
4036 ccv_nnc_graph_visit_free(graph_prep->visit);
4037 graph_prep->visit = visit;
4038 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4038, __extension__ __PRETTY_FUNCTION__
); }))
;
4039 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4040 }
4041 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
4042 for (i = 0; i < node->graph_ref_size; i++)
4043 {
4044 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
4045 if (graph_ref >= 0)
4046 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4047 }
4048 } ccv_nnc_graph_visit_endfor} }
4049}
4050
4051const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4052
4053void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4054{
4055 assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4055, __extension__ __PRETTY_FUNCTION__); }))
;
4056 assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4056, __extension__ __PRETTY_FUNCTION__
); }))
;
4057 assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4057, __extension__ __PRETTY_FUNCTION__
); }))
;
4058 int i;
4059 // Cannot bind the multi-view.
4060 for (i = 0; i < tensor_bind_size; i++)
4061 {
4062 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4062, __extension__ __PRETTY_FUNCTION__
); }))
;
4063 assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4063, __extension__ __PRETTY_FUNCTION__
); }))
;
4064 }
4065 ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4066 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4067 ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4068 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4069 *tensor_arena_ref = tensor_arena;
4070 // The above handled tensor allocation; now we need to materialize the graph from symbolic to real.
4071 _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4072 // Now that tensor allocation is done, if there are any dup_breakpoints, we need to clean them up.
4073 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4074 *graph_ref = graph_prep->graph;
4075 ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4076 _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4077 _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4078 *graph_exec_arena_ref = graph_exec_arena;
4079 _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4080}
4081
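For reference, a minimal caller of the entry point defined above. Only the compile call itself and ccv_nnc_default_compile_params are taken from this excerpt; the symbolic-graph accessors are the ones used internally at line 3773, and the three *_free cleanup calls are assumed from the public header rather than shown here, so treat the whole thing as a sketch:

    ccv_nnc_graph_t* graph = 0;
    ccv_nnc_tensor_arena_t* tensor_arena = 0;
    ccv_nnc_graph_exec_arena_t* graph_exec_arena = 0;
    // symbolic_graph is assumed to be an already-built ccv_nnc_symbolic_graph_t* with its
    // sources and destinations set.
    ccv_nnc_symbolic_graph_compile(symbolic_graph, ccv_nnc_default_compile_params,
        0, 0, // no tensor binds
        0, 0, // no output symbols passed
        ccv_nnc_symbolic_graph_sources(symbolic_graph), ccv_nnc_symbolic_graph_source_size(symbolic_graph),
        ccv_nnc_symbolic_graph_destinations(symbolic_graph), ccv_nnc_symbolic_graph_destination_size(symbolic_graph),
        &graph, &tensor_arena, &graph_exec_arena);
    // ... run the concrete graph with the two arenas ...
    ccv_nnc_graph_free(graph);                       // assumed public API
    ccv_nnc_tensor_arena_free(tensor_arena);         // assumed public API
    ccv_nnc_graph_exec_arena_free(graph_exec_arena); // assumed public API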
4082static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4083{
4084 // Buffers are inherited from above, no need to dealloc.
4085 int i;
4086 for (i = 0; i < tensor_arena->sub_arena_size; i++)
4087 if (tensor_arena->sub_arenas[i])
4088 _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4089 for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4090 {
4091 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
(size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i)))
);
4092 assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4092, __extension__ __PRETTY_FUNCTION__
); }))
;
4093 ccv_nnc_tensor_multiview_free(*mv);
4094 }
4095 ccv_array_free(tensor_arena->tensor_metadata);
4096 ccv_array_free(tensor_arena->m_tensor_idx);
4097 if (tensor_arena->pb_vt_tensors)
4098 ccfreefree(tensor_arena->pb_vt_tensors);
4099 if (tensor_arena->vt_alias_r_refs_p)
4100 ccfreefree(tensor_arena->vt_alias_r_refs_p);
4101 if (tensor_arena->vt_sizes)
4102 ccfreefree(tensor_arena->vt_sizes);
4103 ccfreefree(tensor_arena);
4104}
4105
4106void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4107{
4108 assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
== (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4108, __extension__ __PRETTY_FUNCTION__
); }))
;
4109 assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4109, __extension__ __PRETTY_FUNCTION__
); }))
;
4110 assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4110, __extension__ __PRETTY_FUNCTION__
); }))
;
4111 // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4112 int i;
4113 if (!tensor_arena->pb_vt_tensors)
4114 {
4115 tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4116 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4117 if (tensor_arena->vt_tensors[i])
4118 tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4119 }
4120 if (!tensor_arena->vt_alias_r_refs_p)
4121 {
4122 tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4123 tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4124 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4125 if (tensor_arena->vt_alias_refs[i])
4126 {
4127 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4128 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4128, __extension__ __PRETTY_FUNCTION__
); }))
;
4129 ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4130 }
4131 int refp = 0;
4132 for (i = 1; i < tensor_arena->vt_tensor_size; i++) // Allocate, for each tensor with aliases, its position in vt_alias_r_refs. It points to the end of its range.
4133 if (tensor_arena->vt_alias_r_refs_p[i])
4134 refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4135 else
4136 tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4137 for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4138 tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4139 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4140 if (tensor_arena->vt_alias_refs[i])
4141 {
4142 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4143 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4143, __extension__ __PRETTY_FUNCTION__
); }))
;
4144 const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4145 assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4145, __extension__ __PRETTY_FUNCTION__); }))
;
4146 tensor_arena->vt_alias_r_refs[pos] = i;
4147 }
4148 }
4149 const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
4150 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4151 {
4152 assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4152, __extension__ __PRETTY_FUNCTION__
); }))
; // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4153 assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&
4154 ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||
4155 (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info));
4156 } else
4157 { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)); }
4158 if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d]))
4159 { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0); }
4160 tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4161 if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4162 for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4163 {
4164 const int d = tensor_arena->vt_alias_r_refs[i];
4165 if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match; we've reached the end of this alias range.
4166 break;
4167 ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4168 d_tensor->info.datatype = tensor->info.datatype;
4169 d_tensor->info.reserved = tensor->info.reserved;
4170 if (CCV_IS_TENSOR_VIEW(d_tensor))
4171 ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4172 else {
4173 d_tensor->data.u8 = tensor->data.u8;
4174 d_tensor->dataof = tensor->dataof;
4175 }
4176 }
4177}
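// The block above lazily builds a reverse alias index on the first external bind:
// vt_alias_r_refs_p[d] is a counting-sort style prefix sum that, for each non-alias tensor d,
// points at the end of d's range on vt_alias_r_refs (or -1 when d has no aliases). With the
// index in place, a bind copies the incoming data pointer, datatype and reserved bits to every
// alias of the bound symbol so tensor views stay consistent with the new storage.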
4178
4179void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4180{
4181 if (!tensor_arena->pb_vt_tensors)
4182 return;
4183 int i;
4184 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4185 if (tensor_arena->vt_tensors[i])
4186 tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4187}
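// ccv_nnc_tensor_arena_clear_bindings undoes earlier binds by restoring every tensor's original
// data pointer from the pb_vt_tensors snapshot; it is a no-op when nothing was ever bound.
// A hedged sketch of the usual bind / run / clear cycle (`arena`, `symbol` and `x` are assumed
// from context, and ccv_nnc_tensor_bind_symbol is assumed to be the bind entry point):
//
//   ccv_nnc_tensor_bind_symbol(arena, symbol, x);  // point symbol at caller-owned data
//   /* ... evaluate the concrete graph ... */
//   ccv_nnc_tensor_arena_clear_bindings(arena);    // restore the arena-owned pointers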
4188
4189uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4190{
4191 uint64_t total_size = 0;
4192 int i;
4193 for (i = 0; i < tensor_arena->buffer_size; i++)
4194 total_size += tensor_arena->buffers[i].size;
4195 return total_size;
4196}
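// ccv_nnc_tensor_arena_size sums the backing buffer sizes, i.e. the bytes the arena reserved,
// not the bytes the individual tensors logically cover. A minimal reporting sketch (`arena` is
// assumed to come from a prior ccv_nnc_symbolic_graph_compile; the printf is illustrative):
//
//   const uint64_t bytes = ccv_nnc_tensor_arena_size(arena);
//   printf("tensor arena reserves %llu bytes\n", (unsigned long long)bytes);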
4197
4198static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4199{
4200 int i;
4201 if (mv->it)
4202 mv->it->info = params;
4203 for (i = 0; i < mv->repeat + mv->kind; i++)
4204 {
4205 ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i];
4206 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4207 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4208 else
4209 tensor->info = params;
4210 }
4211}
4212
4213int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4214{
4215 int i;
4216 assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size);
4217 if (!tensor_arena->vt_sizes) // Keep the original sizes so we can check against them and see whether we would overflow.
4218 {
4219 tensor_arena->vt_sizes = (size_t*)ccmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4220 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4221 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4222 {
4223 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4224 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4225 {
4226 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4227 while (CCV_IS_TENSOR_MULTIVIEW(mv))
4228 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
4229 tensor = (ccv_nnc_tensor_t*)mv;
4230 }
4231 tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4232 }
4233 }
4234 int flag = 0;
4235 for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4236 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4237 {
4238 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4239 ccv_nnc_tensor_param_t params = symbol_info->info;
4240 params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4241 params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4242 flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4243 }
4244 if (flag)
4245 return -1;
4246 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4247 if (tensor_arena->vt_tensors[i])
4248 {
4249 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
4250 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4251 if (CCV_IS_TENSOR_MULTIVIEW(tensor))
4252 {
4253 assert(!tensor_arena->vt_alias_refs[i]);
4254 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4255 } else if (!tensor_arena->vt_alias_refs[i]) {
4256 ccv_nnc_tensor_param_t params = symbol_info->info;
4257 params.datatype = tensor->info.datatype;
4258 params.reserved = tensor->info.reserved;
4259 tensor->info = params;
4260 } else {
4261 off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4262 ccv_nnc_tensor_param_t params = symbol_info->info;
4263 params.datatype = tensor->info.datatype;
4264 params.reserved = tensor->info.reserved;
4265 tensor->info = params;
4266 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4267 ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4268 if (CCV_IS_TENSOR_VIEW(tensor))
4269 ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4270 }
4271 }
4272 // Should handle sub_tensor_arena; we don't do that at the moment.
4273 assert(!graph->sub_graphs);
4274 return 0;
4275}
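// ccv_nnc_tensor_arena_reinit lets an already compiled graph be reused after tensor symbol
// shapes change, provided no non-alias tensor outgrows the size recorded on first use
// (vt_sizes); otherwise it returns -1 and the caller has to recompile. A minimal sketch of
// that pattern, with `arena`, `exec_arena`, `symbolic_graph` and `graph` assumed from an
// earlier ccv_nnc_symbolic_graph_compile:
//
//   if (ccv_nnc_tensor_arena_reinit(arena, symbolic_graph) == 0)
//     ccv_nnc_graph_exec_reinit(exec_arena, graph, symbolic_graph); // refresh commands too
//   else
//     { /* shapes outgrew the arena: free and recompile instead */ }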
4276
4277void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4278{
4279 assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size);
4280 int i;
4281 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4282 {
4283 const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4284 if (graph_exec.d < 0)
4285 continue;
4286 const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4287 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i);
4288 ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4289 if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, replace the backend and algorithm with the existing ones, which have presumably been autotuned.
4290 {
4291 new_cmd.backend = existing_cmd.backend;
4292 new_cmd.algorithm = existing_cmd.algorithm;
4293 }
4294 ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4295 }
4296}
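// ccv_nnc_graph_exec_reinit re-reads each exec symbol's command and pushes it into the concrete
// graph via ccv_nnc_graph_exec_set; when the cmd identity is unchanged it preserves the
// previously selected backend and algorithm, so earlier autotuning is not discarded. It is
// typically paired with ccv_nnc_tensor_arena_reinit, as sketched above.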
4297
4298void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4299{
4300 int i;
4301 for (i = 0; i < tensor_arena->buffer_size; i++)
4302 {
4303 if (!tensor_arena->buffers[i].ptr)
4304 continue;
4305 const int buffer_type = tensor_arena->buffers[i].type;
4306 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type);
4307#ifdef HAVE_CUDA
4308 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4309 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4310 {
4311 if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4312 tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4313 else
4314 cufree(device_id, tensor_arena->buffers[i].ptr);
4315 } else {
4316 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4317 if (tensor_arena->buffers[i].pin_mem)
4318 cuhostfree(tensor_arena->buffers[i].ptr);
4319 else
4320 ccfree(tensor_arena->buffers[i].ptr);
4321 }
4322#elif defined(HAVE_MPS)
4323 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type);
4324 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4325 {
4326 // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4327 // tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4328 // else
4329 mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4330 } else {
4331 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4332 ccfree(tensor_arena->buffers[i].ptr);
4333 }
4334#else
4335 assert(memory_type == CCV_TENSOR_CPU_MEMORY);
4336 ccfree(tensor_arena->buffers[i].ptr);
4337#endif
4338 tensor_arena->buffers[i].ptr = 0;
4339 }
4340 // For now, the life-cycle of the disposers lives with the buffer. It may end before the tensor arena deallocates.
4341 if (tensor_arena->disposers)
4342 {
4343 for (i = 0; i < tensor_arena->disposers->rnum; i++)
4344 {
4345 ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i);
4346 disposer->dispose(disposer->ptr, disposer->userdata);
4347 }
4348 ccv_array_free(tensor_arena->disposers);
4349 tensor_arena->disposers = 0;
4350 }
4351}
4352
4353void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4354{
4355 ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4356 _ccv_nnc_tensor_arena_free(tensor_arena);
4357}
4358
4359void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4360{
4361 int i;
4362 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4363 if (graph_exec_arena->sub_arenas[i])
4364 ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4365 ccfree(graph_exec_arena);
4366}
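// Teardown mirrors compilation: ccv_nnc_tensor_arena_free releases the backing buffers through
// ccv_nnc_tensor_arena_buffer_free (which also runs any registered disposers) and then the arena
// metadata, while ccv_nnc_graph_exec_arena_free recursively frees its sub-arenas before itself.
// A minimal teardown sketch, with `graph`, `arena` and `exec_arena` assumed from a prior compile
// and ccv_nnc_graph_free assumed as the concrete-graph destructor:
//
//   ccv_nnc_graph_free(graph);
//   ccv_nnc_tensor_arena_free(arena);
//   ccv_nnc_graph_exec_arena_free(exec_arena);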