Bug Summary

File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 3794, column 7
The left operand of '==' is a garbage value

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-05-17-061038-2658378-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA1
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12
13// MARK - Level-3 API
14
// Tensor arena block: the per-tensor record that the allocation planning code in this file works on.
// Fields commented "Starts at 1" hold 1-based references; the visible code treats
// companion_ref == 0 as "no companion" and subtracts 1 before using it as an index.
15typedef struct {
16 int flags; // Bit flags manipulated through the TENSOR_* macros (UNASSIGNED / ALIAS state, READ_ONLY / WRITE_ONLY access, ANONYMOUS, UNFOLDABLE_AS_INPUT / UNFOLDABLE_AS_OUTPUT).
17 int type; // Memory type of the block; the allocation planner only reuses buffers between blocks of the same type.
18 int pin_mem; // Whether this memory needs to be pinned.
19 int ref; // Reference to another tensor block. Starts at 1.
20 int alias_ref; // Reference to another tensor block when that other block is an alias. Starts at 1.
21 int bypass_ref; // Copied over from the bypass_ref of the underlying tensor symbol. Starts at 1.
22 int companion_ref; // Reference to another block that shares the same memory region with this one. Starts at 1. The current crude implementation requires the two blocks to be companions of each other. Of the pair, the block with the smaller index (i < companion_ref - 1, see IS_PRIMARY_COMPANION) is the primary and the other one is the secondary. The allocation algorithm uses the primary throughout.
23 int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Starts at 1.
24 ccv_array_t* r_refs; // If this block is referenced by other blocks, this array points back to those blocks. Starts at 1.
25 uint64_t size; // The expected size of the tensor.
26 int p_refs[2]; // References to the parent tensor block; there are at most two. Starts at 1.
27 ccv_array_t* dup_p_refs; // References to the parent tensor block from the duplicated tensor blocks. There can be many. Starts at 0.
28 ccv_array_t* head; // The head nodes (there can be several when the graph alone cannot determine which one comes first).
29 ccv_array_t* tail; // The tail nodes (there can be several when the graph alone cannot determine which one comes last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
// True when the block at index idx is the primary of its companion pair.
// companion_ref is 1-based, so (companion_ref - 1) is the companion's index and the block is
// primary when its own index is the smaller one. For a block without a companion
// (companion_ref == 0) the subtraction yields -1, which the uint32_t cast turns into the largest
// 32-bit value, so the comparison holds for every index and the block counts as primary.
32#define IS_PRIMARY_COMPANION(idx, block)((idx) < (uint32_t)((block).companion_ref - 1)) ((idx) < (uint32_t)((block).companion_ref - 1))
33
// Bit layout of ccv_nnc_tensor_block_t.flags:
//   bits 0-1 (mask 0x3): assignment state, 0 = ordinary, UNASSIGNED or ALIAS.
//   bits 2-3 (mask 0xc): access mode, READ_ONLY, WRITE_ONLY or both (READ_WRITE).
//   bit 4 and up: independent marker bits.
enum {
	UNASSIGNED = 0x1,
	ALIAS = 0x2,
	READ_ONLY = 0x4,
	WRITE_ONLY = 0x8,
	READ_WRITE = 0xc, // READ_ONLY | WRITE_ONLY
	ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not referencing any specific tensor).
	UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
	UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
};

// Every macro below takes an lvalue expression `t` that has an int `flags` member. The argument is
// parenthesized in each expansion so that expressions such as `*ptr` or `cond ? a : b` expand
// correctly; `t` may still be evaluated more than once, so it must be free of side effects.

// Assignment state (low two bits).
#define TENSOR_EXPECT_ORDINARY(t) (((t).flags & 0x3) == 0)
#define TENSOR_EXPECT_SET_ORDINARY(t) ((t).flags = ((t).flags & ~0x3))
#define TENSOR_EXPECT_UNASSIGNED(t) (((t).flags & 0x3) == UNASSIGNED)
#define TENSOR_EXPECT_SET_UNASSIGNED(t) ((t).flags = (((t).flags & ~0x3) | UNASSIGNED))
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) ((t).flags = ((t).flags & ~0x1))
#define TENSOR_EXPECT_ALIAS(t) (((t).flags & 0x3) == ALIAS)
// Computable means neither an alias nor unassigned.
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))

// Access mode (bits 2-3). `rw` is one of READ_ONLY, WRITE_ONLY, READ_WRITE (or 0 to clear).
#define TENSOR_READ_WRITE(t) ((t).flags & 0xc)
#define TENSOR_SET_READ_WRITE(t, rw) ((t).flags = (((t).flags & ~0xc) | (rw)))

// Marker bits.
#define TENSOR_SET_ANONYMOUS(t) ((t).flags = (((t).flags & ~0x10) | ANONYMOUS))
#define TENSOR_IS_ANONYMOUS(t) ((t).flags & ANONYMOUS)
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_INPUT))
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) ((t).flags & UNFOLDABLE_AS_INPUT)
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_OUTPUT))
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) ((t).flags & UNFOLDABLE_AS_OUTPUT)

// Non-zero when the tensor symbol flags request initialization with zeros or with ones.
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62
63// Holds additional information about the exec nodes.
64typedef struct {
65 int flags; // NOTE(review): presumably a combination of the CCV_NNC_GRAPH_EXEC_ATTR_* values below; the use is not visible here, confirm.
66} ccv_nnc_graph_exec_flag_t;
67
// Attribute bits for exec nodes.
68enum {
69 CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // An additional IO transfer needs to be inserted for the case..of statement.
70};
71
// A candidate tensor block considered for placement by the allocation planner.
72typedef struct {
73 int index; // Index of the tensor block this candidate stands for.
74 int oc; // Overlap count of the block (plus its companion's overlap count when it has one).
75 int type; // Memory type, copied from the tensor block.
76 uint64_t size; // Size of the block (the larger of the block and its companion when it has one).
77} ccv_nnc_tensor_opt_t;
78
79// We first sort the same type together (because they won't be reused at all.
80// And then we sort by size, after that, sort by oc.
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)void _ccv_nnc_tensor_opt_sort_by_size_and_oc(ccv_nnc_tensor_opt_t
*array, size_t total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_opt_t
t; int sp = 0; struct { ccv_nnc_tensor_opt_t *lb; ccv_nnc_tensor_opt_t
*ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_opt_t
* left = stack[sp].lb; ccv_nnc_tensor_opt_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_opt_t
* ptr; ccv_nnc_tensor_opt_t* ptr2; if( n <= isort_thresh )
{ insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
{ for( ptr2 = ptr; ptr2 > left && more_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_opt_t
* left0; ccv_nnc_tensor_opt_t* left1; ccv_nnc_tensor_opt_t* right0
; ccv_nnc_tensor_opt_t* right1; ccv_nnc_tensor_opt_t* pivot; ccv_nnc_tensor_opt_t
* a; ccv_nnc_tensor_opt_t* b; ccv_nnc_tensor_opt_t* c; int swap_cnt
= 0; left0 = left; right0 = right; pivot = left + (n/2); if(
n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
+ 2*d; left = more_than(*a, *b, aux) ? (more_than(*b, *c, aux
) ? b : (more_than(*a, *c, aux) ? c : a)) : (more_than(*c, *b
, aux) ? b : (more_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = more_than(*a, *b, aux) ? (
more_than(*b, *c, aux) ? b : (more_than(*a, *c, aux) ? c : a)
) : (more_than(*c, *b, aux) ? b : (more_than(*a, *c, aux) ? a
: c)); a = right - 2*d, b = right - d, c = right; right = more_than
(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than(*a, *
c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
= more_than(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than
(*a, *c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !more_than(*pivot, *left
, aux) ) { if( !more_than(*left, *pivot, aux) ) { if( left >
left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
right && !more_than(*right, *pivot, aux) ) { if( !more_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
= 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
_b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
_b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
} else left = left0, right = left0 + n - 1; } else if( m >
1 ) left = right0 - m + 1, right = right0; else break; } } }
}
83#undef more_than
// A (tensor block, hop count) pair; arrays of these are sorted by _ccv_nnc_sort_by_hops.
84typedef struct {
85 int idx; // Block reference; the visible producer stores the block index plus 1.
86 int hop; // Hop value read from the tensor dependency matrix for that block.
87} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)void _ccv_nnc_sort_by_hops(ccv_nnc_tensor_hop_t *array, size_t
total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_hop_t
t; int sp = 0; struct { ccv_nnc_tensor_hop_t *lb; ccv_nnc_tensor_hop_t
*ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_hop_t
* left = stack[sp].lb; ccv_nnc_tensor_hop_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_hop_t
* ptr; ccv_nnc_tensor_hop_t* ptr2; if( n <= isort_thresh )
{ insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
{ for( ptr2 = ptr; ptr2 > left && less_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_hop_t
* left0; ccv_nnc_tensor_hop_t* left1; ccv_nnc_tensor_hop_t* right0
; ccv_nnc_tensor_hop_t* right1; ccv_nnc_tensor_hop_t* pivot; ccv_nnc_tensor_hop_t
* a; ccv_nnc_tensor_hop_t* b; ccv_nnc_tensor_hop_t* c; int swap_cnt
= 0; left0 = left; right0 = right; pivot = left + (n/2); if(
n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
+ 2*d; left = less_than(*a, *b, aux) ? (less_than(*b, *c, aux
) ? b : (less_than(*a, *c, aux) ? c : a)) : (less_than(*c, *b
, aux) ? b : (less_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = less_than(*a, *b, aux) ? (
less_than(*b, *c, aux) ? b : (less_than(*a, *c, aux) ? c : a)
) : (less_than(*c, *b, aux) ? b : (less_than(*a, *c, aux) ? a
: c)); a = right - 2*d, b = right - d, c = right; right = less_than
(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than(*a, *
c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
= less_than(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than
(*a, *c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !less_than(*pivot, *left
, aux) ) { if( !less_than(*left, *pivot, aux) ) { if( left >
left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
right && !less_than(*right, *pivot, aux) ) { if( !less_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
= 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
_b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
_b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
} else left = left0, right = left0 + n - 1; } else if( m >
1 ) left = right0 - m + 1, right = right0; else break; } } }
}
90#undef less_than
91
92// If b has items overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95 assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
("a", "ccv_nnc_symbolic_graph_compile.c", 95, __extension__ __PRETTY_FUNCTION__
); }))
;
96 assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
("b", "ccv_nnc_symbolic_graph_compile.c", 96, __extension__ __PRETTY_FUNCTION__
); }))
;
97 int x, y;
98 for (x = 0; x < b->rnum; x++)
99 {
100 const int p = *(int*)ccv_array_get(b, x)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(x)))
;
101 int flag = 0;
102 // In extreme cases where a is a superset of b, then a is still after b, we are good.
103 for (y = 0; !flag && y < a->rnum; y++)
104 {
105 const int q = *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y)))
;
106 flag = (p == q);
107 }
108 if (!flag)
109 for (y = 0; y < a->rnum; y++)
110 {
111 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y)))
, p);
112 if (!cell.i32 || cell.i32[0] == 0)
113 return 0;
114 }
115 }
116 // If b->rnum == 0, a is after b for sure.
117 // Otherwise, if a->rnum == 0, we don't check any, buf if b->rnum > 0, then we cannot say a is after b.
118 // if both a->rnum > 0 and b->rnum > 0, above logic should checked all.
119 return (a->rnum > 0 || b->rnum == 0);
120}
121
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124 assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
("a", "ccv_nnc_symbolic_graph_compile.c", 124, __extension__
__PRETTY_FUNCTION__); }))
;
125 assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
("b", "ccv_nnc_symbolic_graph_compile.c", 125, __extension__
__PRETTY_FUNCTION__); }))
;
126 if (!a->rnum || !b->rnum)
127 return 0;
128 int x, y, max_hop = 0;
129 for (x = 0; x < a->rnum; x++)
130 {
131 ccv_sparse_matrix_vector_t* const vector = ccv_get_sparse_matrix_vector(exec_dep, *(int*)ccv_array_get(a, x)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(x)))
);
132 if (!vector)
133 return 0;
134 for (y = 0; y < b->rnum; y++)
135 {
136 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, *(int*)ccv_array_get(b, y)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(y)))
);
137 if (!cell.i32 || cell.i32[0] == 0)
138 return 0;
139 if (cell.i32[0] > max_hop)
140 max_hop = cell.i32[0];
141 }
142 }
143 // We've entered this nested-for loop, therefore, it must be verifiably, deterministically after b now.
144 // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
145 return max_hop;
146}
147
148// If every a's head is deterministically after b's tail
149static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
150{
151 return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
152}
153
// Result of allocation planning: which buffers exist and where each tensor block lives in them.
154typedef struct {
155 ccv_array_t** alloc_dep; // One array slot per tensor block (allocated zeroed with tensor_block_size entries).
156 int vt_block_size; // NOTE(review): presumably the number of entries in vt_blocks; confirm.
157 int buffer_size; // NOTE(review): presumably the number of entries in buffers; confirm.
158 int block_size; // NOTE(review): presumably the number of entries in blocks; confirm.
159 int* vt_blocks; // A reference to the block. Needed because blocks only contains available blocks (thus, it does not consider aliases etc.). -1 means no block is pointed to. Starts at 0.
160 struct {
161 int type; // The type from tensor blocks.
162 int pin_mem; // Whether this is pinned memory.
163 int flags; // The flags (currently for READ_ONLY or not).
164 uint64_t size; // The size of the buffer allocated.
165 int p_refs[2]; // Reference to the upper level block. Starts at 1. Only index 0 is valid throughout; the second slot is used in the code as a temporary placeholder.
166 ccv_array_t* dup_p_refs; // References to the parent tensor block from the duplicated tensor blocks. A buffer can point to multiple of them because it can be associated with multiple tensor blocks that point to different outputs (for example, in the 1st unroll pointing to one block while in the 2nd unroll pointing to another). Starts at 0.
167 }* buffers;
168 struct {
169 int buffer_ref; // Which buffer this block uses. Starts at 0.
170 int block_ref; // Which block in the given tensor_block array this entry refers to.
171 uint64_t offset; // The offset of this block.
172 }* blocks;
173} ccv_nnc_tensor_alloc_prep_t;
174
// Per-graph preparation state. Preps form a tree through p (parent) and sub_preps (children),
// one per symbolic graph / sub-graph.
175typedef struct ccv_nnc_symbolic_graph_prep_s {
176 int flags;
177 int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, tensor_metadata at 0 is reserved for it.
178 int p_idx; // Reference to the index in its parent graph's sub-graph array. Starts at 1.
179 int exec_idx;
180 int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
181 int tensor_symbol_info_size;
182 int exec_symbol_info_size;
183 int tensor_block_size;
184 int sub_prep_size;
185 ccv_nnc_tensor_block_t* tensor_blocks;
186 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
187 ccv_nnc_graph_exec_flag_t* exec_flags;
188 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
189 int* dup_tensor_block_ref;
190 ccv_nnc_graph_visit_t* visit;
191 ccv_nnc_tensor_alloc_prep_t* alloc_prep;
192 struct ccv_nnc_symbolic_graph_prep_s* p; // The parent prep.
193 struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
194 // The members below do not need to be freed when the prep is deallocated.
195 const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because it must not be modified here.
196 ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep once created.
197 ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep either.
198 ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the life-cycle of the inputs for the while expr.
199} ccv_nnc_symbolic_graph_prep_t;
200
// Interference record for one tensor block, filled in while building the overlap information.
201typedef struct {
202 int oc; // Overlap count: incremented once for every block found to interfere with this one.
203 ccv_array_t* itf; // Indices (int) of the blocks that interfere with this one; created lazily, stays 0 when there are none.
204} ccv_nnc_tensor_block_adjacent_t;
205
206static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
207{
208 // Compute how many dis-continuous buffers are needed.
209 // We prefer to have several dis-continuous buffers instead of one big buffer because
210 // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
211 // to fully utilize memory.
212 int i, j, k;
213 ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloccalloc(tensor_block_size, sizeof(ccv_array_t*));
214 int allocable_tensor_size = 0, available_tensor_size = 0;
215 for (i = 0; i < tensor_block_size; i++)
216 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
217 {
218 // Tensors that we need the header info.
219 ++available_tensor_size;
220 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
221 // Tensors that we actually need to allocate (exclude the alias).
222 ++allocable_tensor_size;
223 }
224 ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
225 ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
226 ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
227 // Overlap count.
228 for (i = 0; i < tensor_block_size; i++)
229 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
230 for (j = i + 1; j < tensor_block_size; j++)
231 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j])(!((tensor_blocks[j].flags & 0x3) == ALIAS) && !(
(tensor_blocks[j].flags & 0x3) == UNASSIGNED))
)
232 {
233 // We only reuse buffers within the same memory type. The tensor_dt / tensor_df
234 // matrices are only queried later for same-type candidates in this function,
235 // thus cross-type hop relations are not needed for allocation planning here.
236 if (tensor_blocks[i].type != tensor_blocks[j].type)
237 continue;
238 // Check to see if they interfere (default to yes).
239 // If any of the i's head is deterministically later than j's tail
240 // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
241 const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
242 int j_hop_i = 0;
243 if (i_hop_j > 0)
244 {
245 ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
246 ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
247 } else {
248 // It cannot be that both directions are positive. If i can hop to j, we don't
249 // need the reverse hop value for any subsequent allocation decision.
250 j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
251 if (j_hop_i > 0)
252 {
253 ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
254 ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
255 }
256 }
257 if (!i_hop_j && !j_hop_i)
258 {
259 if (!adj[i].itf)
260 adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
261 ccv_array_push(adj[i].itf, &j);
262 ++adj[i].oc;
263 if (!adj[j].itf)
264 adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
265 ccv_array_push(adj[j].itf, &i);
266 ++adj[j].oc;
267 }
268 }
269 const int exec_dep_rows = exec_dep->rows;
270 ccv_matrix_free(exec_dep);
271 ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
272 int* const assigned = (int*)cccalloccalloc(tensor_block_size, sizeof(int));
273 uint64_t* const allocated_offset = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
274 uint64_t* const allocated_size = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
275 uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
276 int num_assigned = 0;
277 // I can do a bit optimization here to assign out const tensor first, but heck, this just works for now.
278 // Allocation graph (assuming there is a source node, and a destination node, which is 0, and (tensor_block_size + 1)
279 // The first channel denotes the bytes available for allocation,
280 // the second channel denotes the offset available for the allocation,
281 ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
282 ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
283 for (j = 0; j < allocable_tensor_size;)
284 {
285 // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
286 uint64_t max_size = 0;
287 ccv_array_clear(opt);
288 int current_type = 0; // Deal with one type at a time.
289 for (i = 0; i < tensor_block_size; i++)
290 if (tensor_blocks[i].size >= max_size &&
291 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&& !assigned[i] &&
292 IS_PRIMARY_COMPANION(i, tensor_blocks[i])((i) < (uint32_t)((tensor_blocks[i]).companion_ref - 1)) &&
293 (!current_type || tensor_blocks[i].type == current_type))
294 {
295 ccv_nnc_tensor_opt_t a = {
296 .size = tensor_blocks[i].size,
297 .index = i,
298 .oc = adj[i].oc,
299 .type = tensor_blocks[i].type,
300 };
301 assert(a.type)((void) sizeof ((a.type) ? 1 : 0), __extension__ ({ if (a.type
) ; else __assert_fail ("a.type", "ccv_nnc_symbolic_graph_compile.c"
, 301, __extension__ __PRETTY_FUNCTION__); }))
;
302 current_type = a.type; // Now we now the primary type we should deal with.
303 if (tensor_blocks[i].companion_ref)
304 {
305 const int companion_ref = tensor_blocks[i].companion_ref - 1;
306 a.size = ccv_max(a.size, tensor_blocks[companion_ref].size)({ typeof (a.size) _a = (a.size); typeof (tensor_blocks[companion_ref
].size) _b = (tensor_blocks[companion_ref].size); (_a > _b
) ? _a : _b; })
;
307 a.oc += adj[companion_ref].oc;
308 }
309 // In case we have a tie, take them all in the array.
310 if (a.size > max_size)
311 ccv_array_clear(opt), max_size = a.size;
312 ccv_array_push(opt, &a);
313 }
314 assert(opt->rnum > 0)((void) sizeof ((opt->rnum > 0) ? 1 : 0), __extension__
({ if (opt->rnum > 0) ; else __assert_fail ("opt->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 314, __extension__ __PRETTY_FUNCTION__
); }))
;
315 // Order opt array by the oc because type and size should be equal at this point.
316 _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
317 // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
318 int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
319 uint64_t min_val[2] = {
320 0, 0
321 };
322 if (j > 0)
323 {
324 for (i = 0; i < opt->rnum; i++)
325 {
326 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i)((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
(size_t)(i)))
;
327 if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
328 continue;
329 // Now, determine the order between a and c. After this, we can always check whether y
330 // can hop to the earliest one and if the latest one can hop to x.
331 // The earliest one will be called p and the latest one will be called q.
332 int p = a.index;
333 int q = a.index;
334 if (tensor_blocks[a.index].companion_ref)
335 {
336 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
337 if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
338 continue;
339 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
340 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
341 p = companion_ref;
342 else {
343 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
344 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
345 q = companion_ref;
346 else { // Otherwise, b is in between p and q.
347 const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
348 const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
349 assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0)((void) sizeof ((p_hop_b.i32 && p_hop_b.i32[0] > 0
&& b_hop_q.i32 && b_hop_q.i32[0] > 0) ? 1
: 0), __extension__ ({ if (p_hop_b.i32 && p_hop_b.i32
[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] >
0) ; else __assert_fail ("p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 349, __extension__ __PRETTY_FUNCTION__
); }))
;
350 }
351 }
352 }
353 assert(tensor_blocks[q].type == tensor_blocks[p].type)((void) sizeof ((tensor_blocks[q].type == tensor_blocks[p].type
) ? 1 : 0), __extension__ ({ if (tensor_blocks[q].type == tensor_blocks
[p].type) ; else __assert_fail ("tensor_blocks[q].type == tensor_blocks[p].type"
, "ccv_nnc_symbolic_graph_compile.c", 353, __extension__ __PRETTY_FUNCTION__
); }))
;
354 const int type = tensor_blocks[p].type;
355 // y is always earlier than x, but this is hard to assert now.
356 // If this edge satisfy the requirement, now we need to find the ones with tightest possible bounds.
357 // Thus, the hop between y and x (through a) should be smallest ones.
358 // We optimized this by first find all allocated nodes that comes to p, and all allocated nodes that
359 // out of q. For these nodes, we try to verify whether they form a connection (by checking against
360 // alloc sparse matrix). If they do, try to see whether we can insert with tightest bound.
361 int y_size = 0;
362 ccv_nnc_tensor_hop_t* const y_buf = buf;
363#define for_block(y, val) do { \
364 if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
365 y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
366 .idx = y + 1, .hop = ((int*)val)[0] \
367 }; \
368 } while(0)
369 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
370 if (y_vector)
371 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
372#undef for_block
373 assert(y_size <= tensor_block_size)((void) sizeof ((y_size <= tensor_block_size) ? 1 : 0), __extension__
({ if (y_size <= tensor_block_size) ; else __assert_fail (
"y_size <= tensor_block_size", "ccv_nnc_symbolic_graph_compile.c"
, 373, __extension__ __PRETTY_FUNCTION__); }))
;
374 int x_size = 0;
375 ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
376#define for_block(x, val) do { \
377 if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
378 x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
379 .idx = x + 1, .hop = ((int*)val)[0] \
380 }; \
381 } while(0)
382 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
383 if (x_vector)
384 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
385#undef for_block
386 assert(y_size + x_size <= tensor_block_size)((void) sizeof ((y_size + x_size <= tensor_block_size) ? 1
: 0), __extension__ ({ if (y_size + x_size <= tensor_block_size
) ; else __assert_fail ("y_size + x_size <= tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 386, __extension__ __PRETTY_FUNCTION__
); }))
;
387 int x, y;
388 if (y_size > 1)
389 _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
390 for (y = 0; y < y_size; y++)
391 {
392 const int hop = exec_dep_rows + y_buf[y].hop;
393 if (hop >= min_hop)
394 break;
395 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
396 if (val.u64 && val.u64[0] >= a.size)
397 {
398 min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
399 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
400 break;
401 }
402 }
403 if (x_size > 1)
404 _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
405 for (x = 0; x < x_size; x++)
406 {
407 const int hop = exec_dep_rows + x_buf[x].hop;
408 if (hop >= min_hop)
409 break;
410 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
411 if (val.u64 && val.u64[0] >= a.size)
412 {
413 min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
414 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
415 break;
416 }
417 }
418 if (x_size > 0)
419 {
420 const int x_min_hop = x_buf[0].hop;
421 for (y = 0; y < y_size; y++)
422 {
423 const int y_hop_p_v = y_buf[y].hop;
424 if (y_hop_p_v + x_min_hop >= min_hop)
425 break;
426 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
427 if (y_vector)
428 {
429 for (x = 0; x < x_size; x++)
430 {
431 const int q_hop_x_v = x_buf[x].hop;
432 const int hop = y_hop_p_v + q_hop_x_v;
433 if (hop >= min_hop)
434 break;
435 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
436 if (val.u64 && val.u64[0] >= a.size)
437 {
438 min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
439 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
440 break;
441 }
442 }
443 }
444 }
445 }
446 // If I found a place, stop, and exit.
447 if (min_y > 0 || min_x < tensor_block_size + 1)
448 {
449 min_i = i;
450 break;
451 }
452 // There is no space to insert this block, mark it as such.
453 tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
454 if (tensor_blocks[a.index].companion_ref)
455 {
456 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
457 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
458 }
459 }
460 }
461 // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group),
462 // and default to the largest size available.
463 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i))((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
(size_t)(({ typeof (0) _a = (0); typeof (min_i) _b = (min_i)
; (_a > _b) ? _a : _b; }))))
;
464 if (min_i == -1)
465 {
466 allocated_size[num_assigned] = a.size;
467 ++num_assigned;
468 }
469 int assign_group = num_assigned;
470 if (min_y > 0)
471 {
472 assign_group = assigned[min_y - 1];
473 // The y and x should belong to the same assigned group.
474 assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group)((void) sizeof ((min_x == tensor_block_size + 1 || assigned[min_x
- 1] == assign_group) ? 1 : 0), __extension__ ({ if (min_x ==
tensor_block_size + 1 || assigned[min_x - 1] == assign_group
) ; else __assert_fail ("min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group"
, "ccv_nnc_symbolic_graph_compile.c", 474, __extension__ __PRETTY_FUNCTION__
); }))
;
475 } else if (min_x < tensor_block_size + 1)
476 assign_group = assigned[min_x - 1];
477 // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
478 if (min_y != 0 || min_x != tensor_block_size + 1)
479 {
480 uint64_t val[2] = {
481 min_val[0], min_val[1]
482 };
483 assert(val[0] >= a.size)((void) sizeof ((val[0] >= a.size) ? 1 : 0), __extension__
({ if (val[0] >= a.size) ; else __assert_fail ("val[0] >= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 483, __extension__ __PRETTY_FUNCTION__
); }))
;
484 val[0] -= a.size;
485 val[1] = val[1] + a.size; // Move the offset to the next one.
486 ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
487 }
488 int strings[3];
489 strings[0] = a.index + 1;
490 int string_size = 1;
491 // Assign out the designated companion if it exists.
492 if (tensor_blocks[a.index].companion_ref)
493 {
494 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495 assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type)((void) sizeof ((tensor_blocks[a.index].type == tensor_blocks
[companion_ref].type) ? 1 : 0), __extension__ ({ if (tensor_blocks
[a.index].type == tensor_blocks[companion_ref].type) ; else __assert_fail
("tensor_blocks[a.index].type == tensor_blocks[companion_ref].type"
, "ccv_nnc_symbolic_graph_compile.c", 495, __extension__ __PRETTY_FUNCTION__
); }))
;
496 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498 {
499 for (i = 0; i < string_size; i++)
500 strings[i + 1] = strings[i];
501 strings[0] = companion_ref + 1;
502 } else {
503 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505 strings[string_size] = companion_ref + 1;
506 else {
507 // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
508 assert(string_size == 2)((void) sizeof ((string_size == 2) ? 1 : 0), __extension__ ({
if (string_size == 2) ; else __assert_fail ("string_size == 2"
, "ccv_nnc_symbolic_graph_compile.c", 508, __extension__ __PRETTY_FUNCTION__
); }))
;
509 strings[2] = strings[1];
510 strings[1] = companion_ref + 1;
511 }
512 }
513 ++string_size;
514 }
515 // Assign out and update oc.
516 for (i = 0; i < string_size; i++)
517 {
518 const int index = strings[i] - 1;
519 // Assign out the selected one.
520 assigned[index] = assign_group;
521 // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522 allocated_offset[index] = min_val[1];
523 if (adj[index].itf)
524 for (k = 0; k < adj[index].itf->rnum; k++)
525 {
526 const int d = *(int*)ccv_array_get(adj[index].itf, k)((void*)(((char*)((adj[index].itf)->data)) + (size_t)(adj[
index].itf)->rsize * (size_t)(k)))
;
527 if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])(!((tensor_blocks[d].flags & 0x3) == ALIAS) && !(
(tensor_blocks[d].flags & 0x3) == UNASSIGNED))
)
528 --adj[d].oc;
529 }
530 }
531 uint64_t val[2] = {
532 a.size, min_val[1]
533 };
534 uint64_t consumed_size = 0;
535 // Go over from min_y to string_size (excluding min_x).
536 for (i = 0; i < string_size; i++)
537 {
538 const uint64_t size = tensor_blocks[strings[i] - 1].size;
539 assert(size <= a.size)((void) sizeof ((size <= a.size) ? 1 : 0), __extension__ (
{ if (size <= a.size) ; else __assert_fail ("size <= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 539, __extension__ __PRETTY_FUNCTION__
); }))
;
540 // Update consumed size if "size" is bigger than what has been consumed so far.
541 if (size > consumed_size)
542 {
543 val[0] = size - consumed_size;
544 ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545 consumed_size = size;
546 val[1] = min_val[1] + consumed_size;
547 }
548 // If it consumed all the flow, break out.
549 if (consumed_size == a.size)
550 break;
551 }
552 for (i = 0; i < string_size; i++)
553 {
554 const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555 uint64_t val[2] = {
556 i_size, min_val[1]
557 };
558 uint64_t consumed_size = 0;
559 for (k = i + 1; k < string_size; k++)
560 {
561 const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size)({ typeof (i_size) _a = (i_size); typeof (tensor_blocks[strings
[k] - 1].size) _b = (tensor_blocks[strings[k] - 1].size); (_a
< _b) ? _a : _b; })
;
562 // Update consumed size if "size" is bigger than what has been consumed so far.
563 if (size > consumed_size)
564 {
565 val[0] = size - consumed_size;
566 ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567 consumed_size = size;
568 val[1] = min_val[1] + consumed_size;
569 }
570 // If it consumed all the flow, break out.
571 if (consumed_size == i_size)
572 break;
573 }
574 val[0] = i_size - consumed_size;
575 // Still have residual, flow it to min_x.
576 if (val[0] > 0)
577 ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578 }
579 if (min_i == -1)
580 {
581 // If we decide to insert a new edge, simply mark every block that does not interfere with it to be retried.
582 const int p = strings[0] - 1;
583 const int q = strings[string_size - 1] - 1;
584 const int type = tensor_blocks[p].type;
585#define for_block(y, val) do { \
586 if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
587 { \
588 tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
589 if (tensor_blocks[y].companion_ref) \
590 { \
591 const int companion_ref = tensor_blocks[y].companion_ref - 1; \
592 tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
593 } \
594 } \
595 } while(0)
596 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
597 if (y_vector)
598 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
599#undef for_block
600#define for_block(x, val) do { \
601 if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
602 { \
603 tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
604 if (tensor_blocks[x].companion_ref) \
605 { \
606 const int companion_ref = tensor_blocks[x].companion_ref - 1; \
607 tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
608 } \
609 } \
610 } while(0)
611 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
612 if (x_vector)
613 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
614#undef for_block
615 }
616 j += string_size;
617 }
618 ccfreefree(tensor_block_cannot_insert);
619 ccfreefree(buf);
620 ccv_array_free(opt);
621 ccv_matrix_free(tensor_df);
622 ccv_matrix_free(tensor_dt);
623#define for_block(y, x, val) do { \
624 if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
625 { \
626 if (!alloc_dep[x - 1]) \
627 alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
628 ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
629 } \
630 } while (0)
631 CCV_SPARSE_FOREACH(alloc, for_block)do { if ((alloc)->major & CCV_SPARSE_COL_MAJOR) { switch
((((alloc)->type) & 0xFF000)) { case CCV_32S: { do { uint32_t
_i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__
((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF
); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0
; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t* const _idx_
= (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const
_v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 ||
!_v_->size) continue; for (_j_ = 0; _j_ < _v_->size
; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.i32 +
(_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
(alloc)->size; __attribute__((unused)) const size_t _c_ =
(((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.u8 + (0))); } } } } while (0); } } } else
{ switch ((((alloc)->type) & 0xFF000)) { case CCV_32S
: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->
size; __attribute__((unused)) const size_t _c_ = (((alloc)->
type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
(alloc)->size; __attribute__((unused)) const size_t _c_ =
(((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } }
} while (0)
;
632#undef for_block
633 ccv_matrix_free(alloc);
634 for (i = 0; i < tensor_block_size; i++)
635 if (adj[i].itf)
636 ccv_array_free(adj[i].itf);
637 ccfreefree(adj);
638 ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
639 alloc_prep->alloc_dep = alloc_dep;
640 alloc_prep->vt_block_size = tensor_block_size;
641 alloc_prep->buffer_size = num_assigned;
642 alloc_prep->block_size = available_tensor_size;
643 alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
644 alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
645 alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
646 memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
647 for (i = 0; i < num_assigned; i++)
648 alloc_prep->buffers[i].size = allocated_size[i];
649 if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
650 {
651 size_t total_size = 0;
652 for (i = 0; i < num_assigned; i++)
653 total_size += allocated_size[i];
654 PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf
("Total buffer size of %zu to be allocated\n", total_size); fflush
(stdout); } } while (0)
;
655 }
656 ccfreefree(allocated_size);
657 j = 0;
658 // Assigning out the tensors (in case of sharing tensors / in-place ops).
659 for (i = 0; i < tensor_block_size; i++)
660 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
661 {
662 alloc_prep->blocks[j].block_ref = i;
663 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
664 {
665 alloc_prep->vt_blocks[i] = j;
666 // Also, set its allocations.
667 assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 667, __extension__ __PRETTY_FUNCTION__
); }))
;
668 const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
669 alloc_prep->blocks[j].offset = allocated_offset[i];
670 if (!alloc_prep->buffers[buffer_ref].type)
671 alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
672 alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
673 alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
674 assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 674, __extension__ __PRETTY_FUNCTION__
); }))
;
675 } else {
676 alloc_prep->vt_blocks[i] = -1;
677 alloc_prep->blocks[j].buffer_ref = -1;
678 alloc_prep->blocks[j].offset = 0;
679 }
680 ++j;
681 } else
682 alloc_prep->vt_blocks[i] = -1;
683 ccfreefree(allocated_offset);
684 ccfreefree(assigned);
685 return alloc_prep;
686}
687
// Release an alloc_prep and the arrays it references.
// Frees every non-NULL alloc_dep[i] (i < vt_block_size), every non-NULL
// buffers[i].dup_p_refs (i < buffer_size), then the alloc_dep pointer array and
// finally alloc_prep itself. blocks / buffers / vt_blocks are not freed
// individually here; the allocation code earlier in this file carves them out of
// the same malloc as alloc_prep, so the last free covers them.
// (In this analyzer listing `ccfreefree` is presumably the `ccfree` macro printed
// together with its expansion `free`.)
688static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
689{
690 int i;
691 for (i = 0; i < alloc_prep->vt_block_size; i++)
692 if (alloc_prep->alloc_dep[i])
693 ccv_array_free(alloc_prep->alloc_dep[i]);
694 for (i = 0; i < alloc_prep->buffer_size; i++)
695 if (alloc_prep->buffers[i].dup_p_refs)
696 ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
697 ccfreefree(alloc_prep->alloc_dep);
698 ccfreefree(alloc_prep);
699}
700
701// Simple allocator from ccv_array_t.
// Reserves room for `size` bytes at the tail of tensor_metadata and returns a
// tagged position for it. The array grows by ceil(size / 16) elements
// (assumes the array's element size is 16 bytes -- TODO confirm at the creation site).
// The return value is the starting element index shifted left by one with the
// low bit set, so it is always odd; CCV_NNC_IS_METADATA_POS tests that low bit
// and _ccv_nnc_tensor_metadata_get shifts it back out.
702static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
703{
704 int pos = tensor_metadata->rnum;
705 int rsize = (size + 15) / 16;
706 ccv_array_resize(tensor_metadata, pos + rsize);
707 return (pos << 1) + 1;
708}
709
// Translate a tagged position (as produced by _ccv_nnc_tensor_metadata_pos_new)
// into the address of the corresponding element inside tensor_metadata.
// The tag bit is dropped with `pos >> 1`; the resulting index is asserted to be
// below rnum and then resolved as data + rsize * index.
// The returned pointer points into the array's storage, so (presumably) it is
// only valid until the array is resized again -- verify against ccv_array_resize.
710static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
711{
712 assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 712, __extension__ __PRETTY_FUNCTION__
); }))
;
713 return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)))
;
714}
715
// Non-zero when the low bit of `ptr` is set, i.e. when the value is a tagged
// metadata position (see _ccv_nnc_tensor_metadata_pos_new, which always returns
// an odd number) rather than a normal tensor pointer.
716#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
717
// Resolve a possibly-tagged tensor reference into a real pointer and, in place,
// replace every tagged position reachable from it with the resolved pointer.
// - vt_tensor without the tag bit is returned unchanged.
// - Otherwise the tensor is looked up in tensor_metadata; a tagged alias_ref is
//   replaced by the resolved address and the aliased tensor is rewired too.
// - For a multiview, every tagged data entry and every tagged entry of mv->sp is
//   replaced and recursively rewired; mv->p is only resolved, not recursed into.
// Returns the resolved tensor pointer.
718static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
719{
720 // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
721 if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
722 return vt_tensor;
723 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
724 if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
725 {
// Keep the tagged position before it is overwritten; the recursion below needs it.
726 const int alias_ref = tensor->alias_ref;
727 tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
728 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
729 }
730 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
731 {
732 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
733 int i;
// The multiview holds kind + repeat data entries.
734 const int count = mv->kind + mv->repeat;
735 for (i = 0; i < count; i++)
736 {
737 if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1)
)
738 {
// Same pattern as alias_ref: remember the position, store the pointer, then recurse.
739 const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i];
740 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
741 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
742 }
743 }
744 // No need to recursively do parent pointer, otherwise we are in deep rewire.
745 if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
746 mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
747 if (mv->sp)
748 for (i = 0; i < mv->sp->rnum; i++)
749 {
// Note: this local `tensor` (pointer to the array slot) shadows the outer `tensor`.
750 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
751 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
752 {
753 const int pos = (int)(intptr_t)*tensor;
754 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
// Entries of mv->sp are expected to be plain tensors, never multiviews.
755 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 755, __extension__ __PRETTY_FUNCTION__
); }))
;
756 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
757 }
758 }
759 }
760 return tensor;
761}
762
// Pairs a byte pointer with an int position.
// NOTE(review): not referenced in the visible part of this file; presumably it
// associates a buffer address with a tensor metadata position -- confirm at its use sites.
763typedef struct {
764 const uint8_t* ptr;
765 int pos;
766} ccv_nnc_tensor_block_pos_t;
767
// Starting from block_ref in `prep`, walk the enclosing preps (preps[idx - 1]
// down to preps[0]) until one whose tensor arena already has a pointer for the
// backing buffer is found, then create a tensor on that buffer at the
// accumulated offset.
//   tensor_metadata: array the new tensor is allocated from.
//   params:          tensor parameters passed to ccv_nnc_tensor().
//   preps, ch, idx:  the chain of enclosing preps and, per level, which copy was
//                    chosen (ch[i] == 0 means the original, non-zero selects an
//                    entry of dup_tensor_block_ref).
// Returns the tagged metadata position of the new tensor, or 0 when no level has
// a buffer pointer or when a remaining level has a non-zero ch[i].
768static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
769{
770 int i;
// Follow the 1-based `ref` chain until reaching a block that refers to nothing else.
771 int unref_block_ref = block_ref;
772 while (prep->tensor_blocks[unref_block_ref].ref)
773 unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
774 int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
775 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 775, __extension__ __PRETTY_FUNCTION__); }))
;
776 assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
== prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 776, __extension__ __PRETTY_FUNCTION__
); }))
;
// This outer buffer_ref is only used to seed p_ref; the loop below declares its own.
777 const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
778 uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
// p_refs[] is 1-based, so p_ref becomes -1 when the buffer has no parent reference.
779 int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
780 for (i = idx - 1; i >= 0; i--)
781 {
782 assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
(p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 782, __extension__ __PRETTY_FUNCTION__); }))
;
783 const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
784 const int unroll_count = graph_prep->unroll_count;
785 if (ch[i]) // Prefer the dup side of things.
786 p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
787 int unref_p_ref = p_ref;
788 while (graph_prep->tensor_blocks[unref_p_ref].ref)
789 unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
790 vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
// Shadows the function-level buffer_ref; refers to this level's buffer.
791 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
// Offsets accumulate across levels: each level's block offset is added on top.
792 offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
793 // If the buffer already exists, prefer that.
794 const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
795 if (ptr)
796 {
797 // If I have any remaining path that is not covered from 0, I cannot possibly
798 // have any pointer from buffer (that can only happen if it is not dup).
799 for (--i; i >= 0; i--)
800 if (ch[i] != 0)
801 return 0;
// NOTE(review): the comment below mentions a linear scan, but the code always
// allocates a fresh tensor entry; no lookup of an existing one happens here.
802 // Try to find the created tensor block pos in the array, just linear scan.
803 const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
804 ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
805 *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
806 ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
807 return tv_pos;
808 }
// No pointer at this level yet: continue with the parent reference of this level's buffer.
809 p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
810 }
811 return 0;
812}
813
814// Descent from root to the prep level, and compose multiview from there.
// Recurses through preps[idx], preps[idx + 1], ... until preps[idx] == graph_prep.
// At that leaf it asks _ccv_nnc_tensor_multiview_find_pos for the tensor position
// and, when the block has unrolled duplicates, wraps the original plus the
// duplicates into a K0N multiview. On the way back up, each level that has
// duplicates composes the per-choice results into another K0N multiview.
//   preserve:      when non-zero, the leaf result is additionally wrapped in a
//                  K1N multiview whose first entry is a placeholder.
//   assign_update: only forwarded to the recursive calls; the check that used it
//                  is commented out below.
//   ch:            scratch path array; ch[idx] is overwritten at each non-leaf level.
//   pos_ref:       out parameter receiving the tagged metadata position.
// Returns 0 on success, or -1 when the leaf lookup found nothing (*pos_ref is
// then left untouched).
815static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
816{
817 assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 817, __extension__ __PRETTY_FUNCTION__); }))
;
818 int i;
819 const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
820 const int unroll_count = prep->unroll_count;
821 if (prep == graph_prep)
822 {
823 const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
824 if (!data_pos)
825 return -1;
826 // Based on ch, go all the way back to find the exact pointer to compose.
827 if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
828 prep->dup_tensor_block_ref &&
829 prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
830 prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
831 {
832 int pos[unroll_count + 1];
833 pos[0] = data_pos;
// NOTE(review): find_pos can return 0 for a dup block; that value is used
// unchecked below -- confirm that cannot happen once data_pos is non-zero.
834 for (i = 0; i < unroll_count; i++)
835 pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
836 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
837 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
838 ccv_nnc_tensor_t* data[unroll_count + 1];
839 for (i = 0; i < unroll_count + 1; i++)
840 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
841 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
// Overwrite the data slots with tagged positions; _ccv_nnc_tensor_metadata_rewire
// turns them back into pointers later.
842 for (i = 0; i < unroll_count + 1; i++)
843 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
844 *pos_ref = mv_pos;
845 } else {
846 *pos_ref = data_pos;
847 }
848 if (preserve)
849 {
850 // If need to preserve, this need to be more complicated. At loop 0, I need to access the new assigned tv.
851 // at any other loops, it should be the same. Thus, for this case, I will create a mv tensor as following:
852 // mv of K11, thus, when loop is 0, it unwrap to mv->data[0], otherwise, unwrap to mv->data[1].
853 // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
854 // arena allocated).
855 // mv->data[1] (prev_mv_pos) is a K01 or K02, depending on whether above we passed raw pointer directly or
856 // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
857 // it to a K01 structure.
858 // Why we didn't wrap it directly as mv->data[0] pointing to a assigned tv pointer and the mv->data[1] pointing
859 // to the raw pointer (as ptr_ref) with K11? The reason is we don't know the assigned tv is pointing to one
860 // memory region, or is a managed by multi-view tensor, which could pointing to different memory regions.
// NOTE(review): *pos_ref was just set to mv_pos or data_pos above. data_pos is
// non-zero here and positions from _ccv_nnc_tensor_metadata_pos_new are odd and
// positive, so the `== -1` branch below appears unreachable. Confirm whether the
// raw-tensor case (*pos_ref == data_pos) was meant to take this wrapping path.
861 int prev_mv_pos = *pos_ref;
862 if (prev_mv_pos == -1)
863 {
864 prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
866 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
867 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
868 tv,
869 }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
870 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
871 }
// prev_mv is fetched only after this allocation, i.e. after the array was resized.
872 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
873 ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
874 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
875 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
876 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
877 (ccv_nnc_tensor_t*)prev_mv,
878 }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
// Parent link and data slots are stored as tagged positions / the placeholder.
879 prev_mv->p = (void*)(intptr_t)mv_pos;
880 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
881 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
882 *pos_ref = mv_pos;
883 }
884 return 0;
885 }
// Non-leaf level: first resolve the path through the original (ch[idx] == 0) ...
886 ch[idx] = 0;
887 int pos[unroll_count + 1];
888 pos[0] = 0;
889 const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
890 assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 890, __extension__ __PRETTY_FUNCTION__); }))
;
// ... then through each unrolled duplicate (ch[idx] == i + 1). A failure is only
// expected on the first duplicate, which means this level has no duplicates.
891 for (i = 0; i < unroll_count; i++)
892 {
893 ch[idx] = i + 1;
894 pos[i + 1] = 0;
895 const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
896 if (dup_retval < 0)
897 {
898 assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 898, __extension__ __PRETTY_FUNCTION__); }))
;
899 break;
900 }
901 }
// i == 0 is also the outcome when unroll_count is 0 (the loop never ran).
902 // If current prep has no dup.
903 if (i == 0)
904 {
905 *pos_ref = pos[0];
906 return 0;
907 }
908 ccv_nnc_tensor_t* data[unroll_count + 1];
909 // Compose to a new multiview.
910 for (i = 0; i < unroll_count + 1; i++)
911 { assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
(pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 911, __extension__ __PRETTY_FUNCTION__); }))
; }
// Allocate the multiview first so that the pointers fetched below are taken
// after the array has been resized.
912 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
913 for (i = 0; i < unroll_count + 1; i++)
914 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
915 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
916 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
// Children that are themselves multiviews get their parent set to this one (as a tagged position).
917 for (i = 0; i < unroll_count + 1; i++)
918 if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
919 ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
920 for (i = 0; i < unroll_count + 1; i++)
921 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
922 *pos_ref = mv_pos;
923 return 0;
924}
925
// Classify p_ref relative to an exec node.
// Returns 1 when p_ref appears in node->outputs, -1 when it appears only in
// node->inputs, and 0 when it appears in neither. node must be non-NULL.
926static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
927{
928 int i;
929 int is_input = 0;
930 assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 930, __extension__ __PRETTY_FUNCTION__); }))
;
// Both scans stop at the first match (the flag is part of the loop condition).
931 for (i = 0; i < node->input_size && !is_input; i++)
932 if (p_ref == node->inputs[i])
933 is_input = 1;
934 int is_output = 0;
935 for (i = 0; i < node->output_size && !is_output; i++)
936 if (p_ref == node->outputs[i])
937 is_output = 1;
938 // Prefer it is an output if it is both the input and the output.
939 if (is_output)
940 return 1;
941 if (is_input)
942 return -1;
943 return 0;
944}
945
// Decide whether the tensor block block_ref of a while-loop sub-graph must keep
// ("preserve") the value of its parent input tensor.
// Returns 1 only when all of the following hold: graph_prep is a while loop, the
// block is assigned, its first parent reference is purely an input of the parent
// exec node, its buffer is not read-only, and the buffer's first parent reference
// differs from the block's. Returns 0 otherwise.
946static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
947{
948 // No need to check whether to preserve if this is not a while loop.
949 if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
950 return 0;
951 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 951, __extension__ __PRETTY_FUNCTION__
); }))
;
952 // If it is unassigned, no need to preserve.
953 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
954 return 0;
// p_refs[] is 1-based; subtract one to get the parent tensor index.
955 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
956 // If p is not input, no need to preserve at all.
// (exec_idx is 1-based as well; the helper returns -1 only for a pure input.)
957 if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
958 return 0;
959 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
960 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 960, __extension__ __PRETTY_FUNCTION__); }))
;
961 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 961, __extension__ __PRETTY_FUNCTION__
); }))
;
962 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
963 // If the buffer is a truly read-only one, no need to preserve.
964 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
965 return 0;
966 /* This needs detailed explanation, what does preserve mean?
967 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
968 * also used outside of the while loop, we cannot reuse the memory region of x for
969 * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
970 * y uses the same memory region as x). The way to workaround this is by using a different
971 * memory region for y = x + 1, but for the first iteration, having x pointing to the
972 * original. During the allocation process, the way to identify whether x should preserve
973 * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
974 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
975 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
976 * it is the input tensor whenever that is possible. A tensor block can point to two parent
977 * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
978 * tensor whenever that is possible. */
979 if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
980 return 0;
981 // Otherwise, return 1 because we now need to preserve.
982 return 1;
983}
984
// Decide whether the tensor block block_ref must be force-broadcast.
// Returns 1 only when the block is assigned, its symbol carries
// CCV_NNC_TENSOR_SYMBOL_TAPE_VAR, its first parent reference is an output of the
// parent exec node, and its buffer is not read-only. Returns 0 otherwise.
// Unlike _ccv_nnc_tensor_block_check_preserve, this does not test for a while loop.
985static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
986{
987 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 987, __extension__ __PRETTY_FUNCTION__
); }))
;
988 // If it is unassigned, no need to broadcast.
989 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
990 return 0;
991 // Only tape var need to force broadcast, otherwise we already share the same memory region.
992 if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
993 return 0;
// p_refs[] is 1-based; subtract one to get the parent tensor index.
994 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
995 // If p is not output, no need to broadcast at all.
996 if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
997 return 0;
998 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
999 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 999, __extension__ __PRETTY_FUNCTION__); }))
;
1000 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1000, __extension__ __PRETTY_FUNCTION__
); }))
;
1001 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
1002 // If the buffer is a truly read-only one, no need to broadcast.
1003 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
1004 return 0;
1005 // Otherwise, return 1 because we now need to force broadcast for this tape var.
1006 return 1;
1007}
1008
1009static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
1010{
1011 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1011, __extension__ __PRETTY_FUNCTION__); }))
;
1012 int i;
1013 for (i = 0; i < mv->kind + mv->repeat; i++)
1014 if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
1015 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = tensor;
1016 else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
1017 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i], tensor);
1018}
1019
1020static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1021{
1022 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1022, __extension__ __PRETTY_FUNCTION__); }))
;
1023 int i;
1024 if (mv->sp)
1025 for (i = 0; i < mv->sp->rnum; i++)
1026 {
1027 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
1028 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1029 {
1030 const int pos = (int)(intptr_t)*tensor;
1031 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1032 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 1032, __extension__ __PRETTY_FUNCTION__
); }))
;
1033 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1034 }
1035 }
1036 for (i = 0; i < mv->kind + mv->repeat; i++)
1037 {
1038 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]) & 1)
)
1039 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1040 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]->alias_ref) & 1)
)
1041 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref);
1042 if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
1043 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1044 }
1045}
1046
1047static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1048{
1049 // Go to the root of the graph.
1050 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1051 int i;
1052 for (i = 1; prep->p; i++)
1053 prep = prep->p;
1054 // Root graph should have no dup tensor blocks.
1055 assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1055, __extension__ __PRETTY_FUNCTION__); }))
;
1056 const int c = i;
1057 const ccv_nnc_symbolic_graph_prep_t* preps[c];
1058 prep = graph_prep;
1059 preps[c - 1] = prep;
1060 for (i = 0; prep->p; i++)
1061 preps[c - 2 - i] = prep = prep->p;
1062 int ch[c]; // Use dynamic allocation for array. This is an array to record our selections when recursive from top to bottom.
1063 memset(ch, 0, sizeof(int) * c);
1064 int pos = 0;
1065 _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1066 assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
(ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1066, __extension__ __PRETTY_FUNCTION__); }))
; // This shouldn't never be modified.
1067 assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
> 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1067, __extension__ __PRETTY_FUNCTION__); }))
;
1068 return pos;
1069}
1070
1071static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1072{
1073 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1074 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1075 ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1076 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1077 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1078 tv,
1079 }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1080 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1081 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = tensor;
1082 return mv_pos;
1083}
1084
1085static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1086{
1087 ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1088 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1089 if (!is_multiview)
1090 return pos;
1091 while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1092 {
1093 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1094 tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1095 }
1096 const ccv_nnc_tensor_t tensor = *tensor_ptr;
1097 const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1098 ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1099 *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1100 new_tensor->dataof = tensor.dataof;
1101 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1102 new_tensor->alias_ref = (uintptr_t)pos;
1103 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1104 return new_pos;
1105}
1106
1107static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1108{
1109 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1110 // It referenced to is not an alias.
1111 assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__
({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1111, __extension__ __PRETTY_FUNCTION__
); }))
;
1112 const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1113 const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1114 assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1114, __extension__ __PRETTY_FUNCTION__
); }))
;
1115 // Will use that to determine whether insert reference or not.
1116 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1117 while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1118 {
1119 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1120 alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1121 }
1122 const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1123 // If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1124 int pos;
1125 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1126 ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1127 {
1128 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1129 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1130 *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1131 tensor->dataof = alias_tensor.dataof;
1132 } else {
1133 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1134 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1135 // Otherwise initialize a tensor view
1136 *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1137 tensor_view->alias_ref = (uintptr_t)alias_pos;
1138 }
1139 vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1140 if (is_multiview)
1141 {
1142 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1143 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1144 }
1145}
1146
1147static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1148{
1149 // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1150 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1151 {
1152 const int ref = tensor_blocks[block_ref].alias_ref - 1;
1153 if (!vt_tensors[ref])
1154 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1155 vt_tensors[block_ref] = vt_tensors[ref];
1156 return;
1157 }
1158 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1158, __extension__ __PRETTY_FUNCTION__
); }))
;
1159 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1160 // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1161 if (!vt_tensors[alias_ref])
1162 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1163 _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1164}
1165
// Linear pointers can be turned into object storage (such as MTLBuffer); this is the
// matching release callback and it is only compiled when MPS support is enabled.
#ifdef HAVE_MPS
static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
{
	(void)userdata; // Part of the disposer callback signature, not needed to free the object.
	mpobjfree(0, ptr);
}
#endif
1173
// Pairs a size with an opaque object pointer.
// NOTE(review): not referenced in this part of the file - presumably tracks created arena objects; confirm at the use site.
1174typedef struct {
1175 size_t size;
1176 void* obj;
1177} tensor_arena_obj_track_t;
1178
1179typedef struct {
1180 void* ptr;
1181 off_t offset;
1182 size_t size;
1183} obj_ptr_key_t;
1184
1185static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1186{
1187 return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1188}
1189
1190static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1191{
1192 return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1193}
1194
1195KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)typedef struct kh_obj_ptr_s { khint_t n_buckets, size, n_occupied
, upper_bound; khint32_t *flags; obj_ptr_key_t *keys; void* *
vals; } kh_obj_ptr_t; static inline __attribute__ ((__unused__
)) kh_obj_ptr_t *kh_init_obj_ptr(void) { return (kh_obj_ptr_t
*)calloc(1,sizeof(kh_obj_ptr_t)); } static inline __attribute__
((__unused__)) void kh_destroy_obj_ptr(kh_obj_ptr_t *h) { if
(h) { free((void *)h->keys); free(h->flags); free((void
*)h->vals); free(h); } } static inline __attribute__ ((__unused__
)) void kh_clear_obj_ptr(kh_obj_ptr_t *h) { if (h && h
->flags) { memset(h->flags, 0xaa, ((h->n_buckets) <
16? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h->
size = h->n_occupied = 0; } } static inline __attribute__ (
(__unused__)) khint_t kh_get_obj_ptr(const kh_obj_ptr_t *h, obj_ptr_key_t
key) { if (h->n_buckets) { khint_t k, i, last, mask, step
= 0; mask = h->n_buckets - 1; k = _kh_obj_ptr_hash_func(key
); i = k & mask; last = i; while (!((h->flags[i>>
4]>>((i&0xfU)<<1))&2) && (((h->
flags[i>>4]>>((i&0xfU)<<1))&1) || !
_kh_obj_ptr_hash_equal(h->keys[i], key))) { i = (i + (++step
)) & mask; if (i == last) return h->n_buckets; } return
((h->flags[i>>4]>>((i&0xfU)<<1))&
3)? h->n_buckets : i; } else return 0; } static inline __attribute__
((__unused__)) int kh_resize_obj_ptr(kh_obj_ptr_t *h, khint_t
new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (
--(new_n_buckets), (new_n_buckets)|=(new_n_buckets)>>1,
(new_n_buckets)|=(new_n_buckets)>>2, (new_n_buckets)|=
(new_n_buckets)>>4, (new_n_buckets)|=(new_n_buckets)>>
8, (new_n_buckets)|=(new_n_buckets)>>16, ++(new_n_buckets
)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size
>= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0
; else { new_flags = (khint32_t*)malloc(((new_n_buckets) <
16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t)); if (
!new_flags) return -1; memset(new_flags, 0xaa, ((new_n_buckets
) < 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t))
; if (h->n_buckets < new_n_buckets) { obj_ptr_key_t *new_keys
= (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets *
sizeof(obj_ptr_key_t)); if (!new_keys) { free(new_flags); return
-1; } h->keys = new_keys; if (1) { void* *new_vals = (void
**)realloc((void *)h->vals,new_n_buckets * sizeof(void*));
if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals
; } } } } if (j) { for (j = 0; j != h->n_buckets; ++j) { if
(((h->flags[j>>4]>>((j&0xfU)<<1))&
3) == 0) { obj_ptr_key_t key = h->keys[j]; void* val; khint_t
new_mask; new_mask = new_n_buckets - 1; if (1) val = h->vals
[j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<<
1)); while (1) { khint_t k, i, step = 0; k = _kh_obj_ptr_hash_func
(key); i = k & new_mask; while (!((new_flags[i>>4]>>
((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask
; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<<
1))); if (i < h->n_buckets && ((h->flags[i>>
4]>>((i&0xfU)<<1))&3) == 0) { { obj_ptr_key_t
tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (1
) { void* tmp = h->vals[i]; h->vals[i] = val; val = tmp
; } (h->flags[i>>4]|=1ul<<((i&0xfU)<<
1)); } else { h->keys[i] = key; if (1) h->vals[i] = val
; break; } } } } if (h->n_buckets > new_n_buckets) { h->
keys = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets
* sizeof(obj_ptr_key_t)); if (1) h->vals = (void**)realloc
((void *)h->vals,new_n_buckets * sizeof(void*)); } free(h->
flags); h->flags = new_flags; h->n_buckets = new_n_buckets
; h->n_occupied = h->size; h->upper_bound = (khint_t
)(h->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static
inline __attribute__ ((__unused__)) khint_t kh_put_obj_ptr(kh_obj_ptr_t
*h, obj_ptr_key_t key, int *ret) { khint_t x; if (h->n_occupied
>= h->upper_bound) { if (h->n_buckets > (h->size
<<1)) { if (kh_resize_obj_ptr(h, h->n_buckets - 1) <
0) { *ret = -1; return h->n_buckets; } } else if (kh_resize_obj_ptr
(h, h->n_buckets + 1) < 0) { *ret = -1; return h->n_buckets
; } } { khint_t k, i, site, last, mask = h->n_buckets - 1,
step = 0; x = site = h->n_buckets; k = _kh_obj_ptr_hash_func
(key); i = k & mask; if (((h->flags[i>>4]>>
((i&0xfU)<<1))&2)) x = i; else { last = i; while
(!((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && (((h->flags[i>>4]>>((i&0xfU)
<<1))&1) || !_kh_obj_ptr_hash_equal(h->keys[i], key
))) { if (((h->flags[i>>4]>>((i&0xfU)<<
1))&1)) site = i; i = (i + (++step)) & mask; if (i ==
last) { x = site; break; } } if (x == h->n_buckets) { if (
((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && site != h->n_buckets) x = site; else x = i; }
} } if (((h->flags[x>>4]>>((x&0xfU)<<
1))&2)) { h->keys[x] = key; (h->flags[x>>4]&=
~(3ul<<((x&0xfU)<<1))); ++h->size; ++h->
n_occupied; *ret = 1; } else if (((h->flags[x>>4]>>
((x&0xfU)<<1))&1)) { h->keys[x] = key; (h->
flags[x>>4]&=~(3ul<<((x&0xfU)<<1)))
; ++h->size; *ret = 2; } else *ret = 0; return x; } static
inline __attribute__ ((__unused__)) void kh_del_obj_ptr(kh_obj_ptr_t
*h, khint_t x) { if (x != h->n_buckets && !((h->
flags[x>>4]>>((x&0xfU)<<1))&3)) { (
h->flags[x>>4]|=1ul<<((x&0xfU)<<1));
--h->size; } }
1196
1197static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1198{
1199 if (params.dim[0] == 0)
1200 return 0;
1201#ifdef HAVE_MPS
1202 if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1203 {
1204 int ret;
1205 const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
12]
* ccv_nnc_tensor_count(params);
1206 const obj_ptr_key_t key = {
1207 .ptr = ptr,
1208 .offset = offset,
1209 .size = size,
1210 };
1211 khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
1212 if (ret != 0)
1213 {
1214 void* obj = mpobjcreate(ptr, offset, size);
1215 if (!tensor_arena->disposers)
1216 tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1217 ccv_nnc_arena_disposer_t disposer = {
1218 .ptr = obj,
1219 .userdata = 0,
1220 .dispose = _ccv_nnc_tensor_arena_obj_dispose
1221 };
1222 ccv_array_push(tensor_arena->disposers, &disposer);
1223 kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1224 return obj;
1225 } else
1226 return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1227 }
1228#endif
1229 return ptr + offset;
1230}
1231
1232static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1233{
1234 // All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1235 // Each tensor have the designation in assigned array, and offset in allocated_offset.
1236 const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1237 ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1238 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1239 const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1240 const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1241 const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1242 const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1243 const int unroll_count = graph_prep->unroll_count;
1244 int i, j;
1245 for (i = 0; i < tensor_symbol_info_size; i++)
1246 for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1247 {
1248 const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1249 if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1250 TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
)
;
1251 }
1252 ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1253 graph_prep->tensor_arena = tensor_arena;
1254 tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1255 tensor_arena->buffers = (void*)(tensor_arena + 1);
1256 tensor_arena->buffer_size = alloc_prep->buffer_size;
1257 tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1258 tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1259 tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1260 tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1261 tensor_arena->pb_vt_tensors = 0;
1262 tensor_arena->vt_alias_r_refs_p = 0;
1263 tensor_arena->vt_alias_r_refs = 0;
1264 tensor_arena->vt_sizes = 0;
1265 tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1266 tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, (sizeof(ccv_nnc_tensor_t) * tensor_symbol_info_size + 15) / 16, 0);
1267 tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1268 tensor_arena->allocator.context.free = allocator.context.free;
1269 tensor_arena->allocator.isa = allocator.isa;
1270 tensor_arena->disposers = 0;
1271 // Copy alias_ref info back to the tensor arena.
1272 for (i = 0; i < tensor_symbol_info_size; i++)
1273 tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1274 // Do the buffer copies.
1275 for (i = 0; i < alloc_prep->buffer_size; i++)
1276 tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1277 tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1278 tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1279 if (graph_prep->while_count_tensor)
1280 {
1281 // If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1282 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1283 assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1283, __extension__ __PRETTY_FUNCTION__
); }))
; // pos must be 0 position.
1284 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1285 *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1286 }
1287 assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
&& !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
&& p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1287, __extension__ __PRETTY_FUNCTION__
); }))
;
1288 if (p_arena && p_graph_prep)
1289 {
1290 // Don't need to allocate the actual buffer, just use the pointer from the above.
1291 PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer assignment for sub arena %p (parent %p)\n",
tensor_arena, p_arena); fflush(stdout); } } while (0)
;
1292 for (i = 0; i < tensor_arena->buffer_size; i++)
1293 {
1294 const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1295 int unref_p_ref = p_ref;
1296 while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1297 unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1298 assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1298, __extension__ __PRETTY_FUNCTION__
); }))
;
1299 const int p_unroll_count = p_graph_prep->unroll_count;
1300 if (p_graph_prep->dup_tensor_block_ref &&
1301 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1302 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1303 {
1304 // This condition means in the parent graph, we point to multiple tensor blocks for the same
1305 // buffer, therefore, we cannot have one single pointer assigned in this case.
1306 // Later we will handle this by generate ccv_tensor_multiview_t structure.
1307 tensor_arena->buffers[i].ptr = 0;
1308 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1309 continue;
1310 }
1311 // Otherwise, find the actual buffer pointer.
1312 const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1313 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1313, __extension__ __PRETTY_FUNCTION__); }))
;
1314 const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1315 if (!p_arena->buffers[buffer_ref].ptr)
1316 {
1317 // Pass it down as 0 ptr.
1318 tensor_arena->buffers[i].ptr = 0;
1319 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1320 continue;
1321 }
1322 const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1323 tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1324 PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
(0)
;
1325 }
1326 } else {
1327 // Now, allocate actual buffers.
1328 PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0)
;
1329 for (i = 0; i < tensor_arena->buffer_size; i++)
1330 {
1331 const int buffer_type = tensor_arena->buffers[i].type;
1332 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1333 if (tensor_arena->buffers[i].size == 0)
1334 {
1335 tensor_arena->buffers[i].ptr = 0;
1336 PRINT(CCV_CLI_VERBOSE, "|-Skip buffer %d with size 0\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Skip buffer %d with size 0\n", i); fflush(stdout
); } } while (0)
;
1337 continue;
1338 }
1339#ifdef HAVE_CUDA1
1340 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1341 {
1342 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1343 if (allocator.isa && allocator.isa->alloc)
1344 tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1345 else
1346 tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1347 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1348 } else {
1349 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1349, __extension__ __PRETTY_FUNCTION__
); }))
;
1350 if (tensor_arena->buffers[i].pin_mem)
1351 tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1352 else
1353 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1354 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1355 }
1356#elif defined(HAVE_MPS)
1357 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1358 {
1359 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1360 // if (allocator.isa && allocator.isa->alloc)
1361 // tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1362 // else
1363 tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1364 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1365 } else {
1366 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1366, __extension__ __PRETTY_FUNCTION__
); }))
;
1367 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1368 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1369 }
1370#else
1371 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1371, __extension__ __PRETTY_FUNCTION__
); }))
;
1372 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1373 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1374#endif
1375 assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1375, __extension__ __PRETTY_FUNCTION__); }))
;
1376 }
1377 }
1378 // Go over sub_preps and allocate arenas for them. Do it this early because
1379 // we may reference tensors from sub arenas. We need to reference tensors
1380 // from sub arenas because, for output tensors, the sub arena's tensor
1381 // will have automatic reference updates.
1382 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1383 if (graph_prep->sub_preps[i])
1384 tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1385 else
1386 tensor_arena->sub_arenas[i] = 0;
1387 memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1388 // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1389 ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1390#ifdef HAVE_MPS
1391 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1392#else
1393 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1394#endif
1395 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1396 if (tensor_arena->sub_arenas[i])
1397 {
1398 assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1398, __extension__ __PRETTY_FUNCTION__
); }))
;
1399 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1400 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1401 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1402 for (j = 0; j < node->output_size; j++)
1403 {
1404 const int idx = node->outputs[j];
1405 const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1;
1406 assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
(s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1406, __extension__ __PRETTY_FUNCTION__); }))
;
1407 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1408 assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1408, __extension__ __PRETTY_FUNCTION__); }))
;
1409 ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1410 // Only assign if it is a multiview tensor.
1411 if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1412 (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1413 sub_arena_out_tensors[idx] = sub_tensor;
1414 }
1415 }
1416 // Assigning out the tensors (in case of sharing tensors / in-place ops).
1417 for (i = 0; i < tensor_symbol_info_size; i++)
1418 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
1419 {
1420 const int vt_ref = alloc_prep->vt_blocks[i];
1421 const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1422 // Either we have dup_tensor_block_ref in current layer, or we have that in
1423 // previous layer, therefore, cannot really find the buffer ptr.
1424 if (tensor_blocks[i].size > 0 &&
1425 (!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1426 ((graph_prep->dup_tensor_block_ref &&
1427 graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1428 graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1429 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1430 {
1431 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1431, __extension__ __PRETTY_FUNCTION__
); }))
; // This must be in a sub-graph.
1432 // If this is an input tensor and it needs to be preserved, wait until we go through the inputs to preserve it.
1433 if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1434 continue;
1435 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1436 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1437 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1438 } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1439 if (tensor_blocks[i].size == 0)
1440 {
1441 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1442 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1443 *tensor = ccv_nnc_tensor(0, tensor_symbol_info[i].info, 0);
1444 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1445 continue;
1446 }
1447 // When we want to allocate, we don't really need to if it need force broadcast, because we will handle that later.
1448 const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1449 // If already created, use the same tensor, and continue.
1450 // Having ptr.
1451 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1452 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1453 // Also, set its allocations.
1454 // Since tensor view is bit compatible with tensor, we can just cast.
1455 void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1456 *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1457 assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1457, __extension__ __PRETTY_FUNCTION__
); }))
;
1458 // If we need to force broadcast, we need to wrap it in a multiview.
1459 if (graph_prep->tensor_blocks[i].p_refs[0] &&
1460 _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1461 {
1462 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1463 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1464 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1465 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1466 tv,
1467 }, 0, 1, graph_prep->graph, mv);
1468 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1469 pos = mv_pos;
1470 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1471 }
1472 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1473 }
1474 }
1475#ifdef HAVE_MPS
1476 kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1477#endif
1478 // Handle binded tensors. First handle cases without aliases.
1479 for (i = 0; i < tensor_bind_size; i++)
1480 {
1481 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1481, __extension__ __PRETTY_FUNCTION__
); }))
;
1482 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1483 if (resolved_symbol.d >= 0)
1484 {
1485 int d = resolved_symbol.d;
1486 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1487 continue;
1488 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1489 // It has nothing to do with alias.
1490 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1491 d = tensor_blocks[d].ref - 1;
1492 // For binded tensors, it shouldn't be assigned yet.
1493 // If it is assigned, the pointer should match the ones from the binded tensor.
1494 // This can only happen if an enforced in-place tensor is binded twice. If that
1495 // happens, we need to make sure it is binded to the same location.
1496 assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1496, __extension__ __PRETTY_FUNCTION__
); }))
;
1497 // See above assertion.
1498 if (tensor_arena->vt_tensors[d])
1499 continue;
1500 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1501 {
1502 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1503 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1504 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1505 if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1506 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1507 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1507, __extension__ __PRETTY_FUNCTION__
); }))
; }
1508 // It is OK to be just as a whole smaller or equal to the binded one.
1509 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1509, __extension__ __PRETTY_FUNCTION__
); }))
;
1510 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1511 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1512 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1513 } else {
1514 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1515 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1516 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1517 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1518 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1519 tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1520 tv->dataof = tensor_binds[i].tensor->dataof;
1521 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1522 }
1523 }
1524 }
1525 // Handle binded tensors. We handle alias here so it can reference to binded tensors.
1526 for (i = 0; i < tensor_bind_size; i++)
1527 {
1528 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1528, __extension__ __PRETTY_FUNCTION__
); }))
;
1529 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1530 if (resolved_symbol.d >= 0)
1531 {
1532 int d = resolved_symbol.d;
1533 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1534 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1535 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1536 // It has nothing to do with alias.
1537 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1538 d = tensor_blocks[d].ref - 1;
1539 if (tensor_arena->vt_tensors[d])
1540 continue;
1541 // Assert original alias has no ofs. Otherwise our binding will be problematic.
1542 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1543 { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
== 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1543, __extension__ __PRETTY_FUNCTION__
); }))
; }
1544 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1545 {
1546 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1547 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1548 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1549 if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1550 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1551 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1551, __extension__ __PRETTY_FUNCTION__
); }))
; }
1552 // It is OK to be just as a whole smaller or equal to the binded one.
1553 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1553, __extension__ __PRETTY_FUNCTION__
); }))
;
1554 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1555 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1556 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1557 } else {
1558 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1559 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1560 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1561 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1562 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1563 tv->data = tensor_binds[i].tensor->data;
1564 tv->dataof = tensor_binds[i].tensor->dataof;
1565 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1566 }
1567 }
1568 }
1569 // Assign out refs. Refs are the simple case, so we handle them first (because they point to exactly the same metadata and the same region).
1570 // Skip refs that are actually aliases.
1571 for (i = 0; i < tensor_symbol_info_size; i++)
1572 // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1573 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1574 {
1575 int ref = tensor_blocks[i].ref - 1;
1576 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1577 ref = tensor_blocks[ref].ref - 1;
1578 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1578, __extension__ __PRETTY_FUNCTION__); }))
;
1579 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1580 }
1581 // Now that refs are assigned out, handle the tensors that need to be preserved because this graph is a sub-graph of a while loop.
1582 if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1583 {
1584 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1584, __extension__ __PRETTY_FUNCTION__
); }))
;
1585 const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1586 const int p_idx = graph_prep->p_idx - 1;
1587 for (i = 0; i < node->input_size; i++)
1588 {
1589 const int idx = node->inputs[i];
1590 int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx)))
- 1;
1591 assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1591, __extension__ __PRETTY_FUNCTION__); }))
;
1592 const int vt_ref = alloc_prep->vt_blocks[block_ref];
1593 if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1594 continue;
1595 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1595, __extension__ __PRETTY_FUNCTION__); }))
;
1596 const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1597 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1597, __extension__ __PRETTY_FUNCTION__); }))
;
1598 assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1598, __extension__ __PRETTY_FUNCTION__
); }))
;
1599 // Either we have dup_tensor_block_ref in current layer, or we have that in
1600 // previous layer, therefore, cannot really find the buffer ptr.
1601 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1602 ((graph_prep->dup_tensor_block_ref &&
1603 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1604 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1605 !tensor_arena->buffers[buffer_ref].ptr))
1606 {
1607 // We haven't allocated anything for this yet.
1608 assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1608, __extension__ __PRETTY_FUNCTION__
); }))
;
1609 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1610 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1611 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1612 } else {
1613 const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1614 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1615 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1616 }
1617 }
1618 }
1619 // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we select the original input.
1620 // This creates the multi-view tensor to achieve that.
1621 for (i = 0; i < tensor_symbol_info_size; i++)
1622 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1623 {
1624 const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1625 // Create phi multi-view.
1626 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1627 const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1628 const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1629 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1630 ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1631 ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1632 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1633 intv,
1634 outv,
1635 }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1636 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1637 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1638 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1639 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1640 }
1641 // Now it is time to handle alias.
1642 for (i = 0; i < alloc_prep->block_size; i++)
1643 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1644 {
1645 const int block_ref = alloc_prep->blocks[i].block_ref;
1646 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1647 {
1648 // Assigning out the tensor aliases.
1649 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1649, __extension__ __PRETTY_FUNCTION__
); }))
;
1650 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1651 }
1652 }
1653 // Now assigning out the rest of alias refs.
1654 for (i = 0; i < tensor_symbol_info_size; i++)
1655 // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1656 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1657 {
1658 int ref = tensor_blocks[i].alias_ref - 1;
1659 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1659, __extension__ __PRETTY_FUNCTION__); }))
;
1660 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1661 }
1662 // Replace the tensor placeholder within the sub arena's multi-view with the input tensor.
1663 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1664 if (tensor_arena->sub_arenas[i])
1665 {
1666 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1667 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1668 for (j = 0; j < node->input_size; j++)
1669 {
1670 const int idx = node->inputs[j];
1671 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1672 if (s_idx < 0)
1673 continue;
1674 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1675 // Only do the replacement if it is a multi-view tensor.
1676 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1677 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1678 {
1679 // It cannot be binded tensor.
1680 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1680, __extension__ __PRETTY_FUNCTION__
); }))
;
1681 const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1682 const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1683 ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1684 // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1685 // to this tensor.
1686 if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1687 {
1688 const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1689 ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1690 ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1691 ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1692 ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1693 ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
: (multiview)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]);
1694 while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1695 tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]);
1696 *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1697 ref_tensor->data = tv->data;
1698 ref_tensor->dataof = tv->dataof;
1699 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1700 } else
1701 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1702 }
1703 }
1704 }
1705 // After alias created, for case..of statement, we now revert back to flat tensor rather than multi-view.
1706 // No worries though, this new tensor is subscribed for the phi multi-view. More over, we have logic
1707 // when initialize case..of node, which will take the phi multi-view again.
1708 for (i = 0; i < tensor_symbol_info_size; i++)
1709 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1710 {
1711 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
& 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1711, __extension__ __PRETTY_FUNCTION__
); }))
;
1712 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1713 assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1713, __extension__ __PRETTY_FUNCTION__); }))
;
1714 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1715 }
1716 // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1717 for (i = 0; i < tensor_symbol_info_size; i++)
1718 if (tensor_arena->vt_tensors[i])
1719 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1720 // Associate multiview tensors from sub arena to the parent.
1721 if (sub_arena_out_tensors)
1722 {
1723 for (i = 0; i < alloc_prep->block_size; i++)
1724 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1725 {
1726 const int block_ref = alloc_prep->blocks[i].block_ref;
1727 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1728 continue;
1729 int sub_arena_ref = block_ref;
1730 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1731 {
1732 // Assigning out the tensor aliases.
1733 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1733, __extension__ __PRETTY_FUNCTION__
); }))
;
1734 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1735 // It referenced to is not an alias.
1736 assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1736, __extension__ __PRETTY_FUNCTION__
); }))
;
1737 sub_arena_ref = alias_ref;
1738 if (!sub_arena_out_tensors[sub_arena_ref])
1739 continue;
1740 }
1741 if (!sub_arena_out_tensors[sub_arena_ref])
1742 continue;
1743 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1744 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1744, __extension__ __PRETTY_FUNCTION__); }))
;
1745 // This is only possible if the vt_tensors is a phi node.
1746 if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1747 {
1748 // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1749 ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1750 assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1750, __extension__ __PRETTY_FUNCTION__); }))
;
1751 assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1751, __extension__ __PRETTY_FUNCTION__
); }))
;
1752 CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]->alias_ref = (uintptr_t)mv;
1753 ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]);
1754 } else {
1755 tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1756 ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1757 }
1758 }
1759 }
1760 // Go over all the tensors that has assign_ref. If the tensor it is assigned from is:
1761 // 1). From sub_arena_out_tensors, it could be possible that it now pointing to an area this arena doesn't know.
1762 // 2). From phi multi-view, for this case, it is in fact that this arena won't know which memory I am going to use prior.
1763 // Therefore, for above two scenarios, the tensor has assign_ref, even it is a multiview tensor, need to subscribe
1764 // to the output of assign_ref tensor.
1765 for (i = 0; i < tensor_symbol_info_size; i++)
1766 if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1767 {
1768 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1769 ccv_nnc_tensor_t* assign_tensor;
1770 if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1771 assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1772 else
1773 assign_tensor = tensor_arena->vt_tensors[assign_ref];
1774 ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1775 }
1776 // After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1777 for (i = 0; i < tensor_bind_size; i++)
1778 {
1779 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1779, __extension__ __PRETTY_FUNCTION__
); }))
;
1780 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1781 if (resolved_symbol.d >= 0)
1782 {
1783 int d = resolved_symbol.d;
1784 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1785 // It has nothing to do with alias.
1786 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1787 d = tensor_blocks[d].ref - 1;
1788 // Note we don't trace back on alias. This is intentional.
1789 assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1789, __extension__ __PRETTY_FUNCTION__
); }))
;
1790 }
1791 }
1792 if (sub_arena_out_tensors)
1793 ccfreefree(sub_arena_out_tensors);
1794 // Rewire sub arena's tensor references.
1795 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1796 if (tensor_arena->sub_arenas[i])
1797 {
1798 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1799 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1800 for (j = 0; j < node->input_size; j++)
1801 {
1802 const int idx = node->inputs[j];
1803 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1804 if (s_idx < 0)
1805 continue;
1806 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1807 // Only do the replacement if it is a multi-view tensor.
1808 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1809 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1810 {
1811 // This is binded tensor, bind it now.
1812 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1813 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1814 else
1815 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1816 }
1817 }
1818 }
1819 return tensor_arena;
1820}
1821
1822static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1823{
1824 assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1824, __extension__ __PRETTY_FUNCTION__); }))
;
1825 if ((intptr_t)graph == tensor_arena->graph_ref)
1826 {
1827 assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
0 && pair_ref < tensor_arena->vt_tensor_size) ;
else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1827, __extension__ __PRETTY_FUNCTION__
); }))
;
1828 return tensor_arena->vt_tensors[pair_ref];
1829 }
1830 int i;
1831 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1832 if (tensor_arena->sub_arenas[i])
1833 {
1834 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1835 if (tensor)
1836 return tensor;
1837 }
1838 return 0;
1839}
1840
1841static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1842{
1843 if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1844 tensor->type |= CCV_TAPE_ALLOC;
1845 else {
1846 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1847 mv->type |= CCV_TAPE_ALLOC;
1848 int i;
1849 for (i = 0; i < mv->repeat + mv->kind; i++)
1850 _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1851 }
1852}
1853
1854static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1855{
1856 assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
__assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1856, __extension__ __PRETTY_FUNCTION__
); }))
;
1857 int i;
1858 for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1859 {
1860 if (graph_prep->tensor_symbol_info[i].pair_ref)
1861 {
1862 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1863 // No need to continue check this if it is from its pair.
1864 continue;
1865 }
1866 if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1867 {
1868 // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1869 if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
)
)
1870 {
1871 const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1872 if (vt_ref >= 0 &&
1873 TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc)
== READ_ONLY)
1874 continue;
1875 }
1876 _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1877 }
1878 }
1879 for (i = 0; i < graph_prep->sub_prep_size; i++)
1880 if (graph_prep->sub_preps[i])
1881 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1882}
1883
1884static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1885{
1886 int i, found = 0;
1887 // Try to insert head.
1888 ccv_array_t* head = tensor_blocks.head;
1889 assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1889, __extension__ __PRETTY_FUNCTION__); }))
;
1890 for (i = 0; i < head->rnum;)
1891 {
1892 const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
;
1893 if (head_idx == idx)
1894 {
1895 found = 1;
1896 break;
1897 }
1898 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1899 if (cell.i32 && cell.i32[0] > 0)
1900 {
1901 /* If the current node is the parent of the head node, check if we found it or not. */
1902 /* If not found, replace the current one. */
1903 if (!found)
1904 {
1905 found = 1;
1906 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= idx;
1907 } else {
1908 /* Remove the current one, change the rnum. */
1909 if (i < head->rnum - 1)
1910 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(head->rnum - 1)))
;
1911 --head->rnum;
1912 continue;
1913 }
1914 } else {
1915 // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1916 cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1917 if (cell.i32 && cell.i32[0] > 0)
1918 {
1919 found = 1;
1920 break;
1921 }
1922 }
1923 /* Advancing i. */
1924 ++i;
1925 }
1926 /* If not found, push this idx to the end of the array. */
1927 if (!found)
1928 ccv_array_push(head, &idx);
1929 // Try to insert tail.
1930 found = 0;
1931 ccv_array_t* tail = tensor_blocks.tail;
1932 assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1932, __extension__ __PRETTY_FUNCTION__); }))
;
1933 for (i = 0; i < tail->rnum;)
1934 {
1935 const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
;
1936 if (tail_idx == idx)
1937 {
1938 found = 1;
1939 break;
1940 }
1941 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1942 if (cell.i32 && cell.i32[0] > 0)
1943 {
1944 /* If the current node is the child of the tail node, check if we found it or not. */
1945 /* If not found, replace the current one. */
1946 if (!found)
1947 {
1948 found = 1;
1949 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= idx;
1950 } else {
1951 /* Remove the current one, change the rnum. */
1952 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(tail->rnum - 1)))
;
1953 --tail->rnum;
1954 continue;
1955 }
1956 } else {
1957 // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1958 cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1959 if (cell.i32 && cell.i32[0] > 0)
1960 {
1961 found = 1;
1962 break;
1963 }
1964 }
1965 /* Advancing i. */
1966 ++i;
1967 }
1968 /* If not found, push this idx to the end of the array. */
1969 if (!found)
1970 ccv_array_push(tail, &idx);
1971}
1972
1973ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1974{
1975 if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1976 {
1977 assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
0 && symbol.d < tensor_arena->vt_tensor_size) ;
else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1977, __extension__ __PRETTY_FUNCTION__
); }))
;
1978 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1979 if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1980 {
1981 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1982 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1983 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1984 return (ccv_nnc_tensor_t*)mv;
1985 }
1986 return tensor;
1987 }
1988 int i;
1989 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1990 if (tensor_arena->sub_arenas[i])
1991 {
1992 ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1993 if (tensor)
1994 return tensor;
1995 }
1996 return 0;
1997}
1998
1999ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
2000{
2001 if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
2002 {
2003 assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
>= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 2003, __extension__ __PRETTY_FUNCTION__
); }))
;
2004 return graph_exec_arena->graph_execs[symbol.d];
2005 }
2006 int i;
2007 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
2008 if (graph_exec_arena->sub_arenas[i])
2009 {
2010 ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
2011 if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
2012 return exec;
2013 }
2014 return (ccv_nnc_graph_exec_t){}; // 0.
2015}
2016
2017ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2018{
2019 return graph_exec_arena->source;
2020}
2021
2022ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2023{
2024 return graph_exec_arena->destination;
2025}
2026
2027// Check whether the head is the beginning of this block.
2028static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
2029{
2030 assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 2030, __extension__ __PRETTY_FUNCTION__
); }))
;
2031 return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0)))
== head_node);
2032}
2033
2034// Check whether the tail is the end of this block.
2035static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2036{
2037 assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2037, __extension__ __PRETTY_FUNCTION__
); }))
;
2038 return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0)))
== tail_node);
2039}
2040
2041// Make two tensor blocks one. Return 1 if that happened.
2042static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2043{
2044 // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2045 if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2046 (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2047 tensor_blocks[p_ref_0].tail->rnum == 1 &&
2048 tensor_blocks[p_ref_1].head->rnum == 1 &&
2049 tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2050 *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
)
== *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
)
)
2051 {
2052 // If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2053 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2053, __extension__ __PRETTY_FUNCTION__); }))
;
2054 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2054, __extension__ __PRETTY_FUNCTION__); }))
;
2055 ccv_array_free(tensor_blocks[p_ref_0].tail);
2056 tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2057 if (tensor_blocks[p_ref_1].p_refs[0])
2058 {
2059 assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2059, __extension__ __PRETTY_FUNCTION__
); }))
; // It simply cannot have more than one p_refs, otherwise we cannot merge.
2060 if (!tensor_blocks[p_ref_0].p_refs[0])
2061 tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2062 else
2063 tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2064 }
2065 tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2066 TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
& ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)))
;
2067 ccv_array_free(tensor_blocks[p_ref_1].head);
2068 if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2069 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
| UNFOLDABLE_AS_INPUT))
;
2070 // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2071 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
& ~0x3) | UNASSIGNED))
;
2072 tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2073 if (!tensor_blocks[p_ref_0].r_refs)
2074 tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2075 ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2076 tensor_blocks[p_ref_1].size = 0;
2077 tensor_blocks[p_ref_1].head = 0;
2078 tensor_blocks[p_ref_1].tail = 0;
2079 return 1;
2080 }
2081 return 0;
2082}
2083
2084static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2085{
2086 int i, j, k;
2087 // Generate exec dependencies (or, in other words, partial ordering of executions).
2088 ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2089 int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2090 int buf_size;
2091 if (p_node_info)
2092 { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2092, __extension__ __PRETTY_FUNCTION__
); }))
; }
2093#define for_block(x, val) \
2094 do { \
2095 if (((int32_t*)val)[0] > 0) \
2096 { \
2097 buf[buf_size * 2] = x; \
2098 buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2099 ++buf_size; \
2100 } \
2101 } while (0)
2102 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx;
{
2103 buf_size = 0; /* save all its parent deps to this buffer */
2104 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2105 if (vector)
2106 CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
2107 if (!node->outgoings)
2108 continue;
2109 for (i = 0; i < node->outgoings->rnum; i++)
2110 {
2111 int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2112 const int32_t one = 1;
2113 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2114 /* If not found, set, if the current node is the destination node, no need
2115 * set itself as parent of subsequent nodes because its terminal nature. */
2116 if (!cell.i32 || cell.i32[0] == 0)
2117 ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2118 if (buf_size > 0)
2119 {
2120 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2121 assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2121, __extension__ __PRETTY_FUNCTION__); }))
;
2122 for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2123 {
2124 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2125 /* If not found, set */
2126 if (!cell.i32 || cell.i32[0] == 0)
2127 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2128 else {
2129 /* Otherwise, set to the longest one */
2130 int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
+ 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; })
;
2131 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2132 }
2133 }
2134 }
2135 }
2136 } ccv_nnc_graph_visit_endfor} }
2137#undef for_block
2138 ccfreefree(buf);
2139 // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2140 const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2141 ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2142 // The reason is that I need to make everyone of them to be unassigned unless it is used somewhere. It
2143 // happens that I have to loop through all relevant node to find out if one is used or not.
2144 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2145 tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2146 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2147 for (i = 0; i < node->input_size; i++)
2148 if (node->inputs[i] >= 0)
2149 {
2150 tensor_blocks[node->inputs[i]].flags = 0;
2151 // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2152 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2153 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2154 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2155 tensor_blocks[node->inputs[i]].pin_mem = 1;
2156 }
2157 for (i = 0; i < node->output_size; i++)
2158 if (node->outputs[i] >= 0)
2159 {
2160 tensor_blocks[node->outputs[i]].flags = 0;
2161 // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2162 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2163 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2164 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2165 tensor_blocks[node->outputs[i]].pin_mem = 1;
2166 }
2167 } ccv_nnc_graph_visit_endfor} }
2168 if (p_node_info)
2169 {
2170 assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2170, __extension__ __PRETTY_FUNCTION__
); }))
;
2171 // Mark it as used if it is used in either input or output.
2172 for (i = 0; i < p_node_info->input_size; i++)
2173 if (p_node_info->inputs[i] >= 0)
2174 {
2175 const int d = p_node_info->inputs[i];
2176 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2177 {
2178 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2179 if (dd >= 0) // If this exists in this sub-graph, great.
2180 tensor_blocks[dd].flags = 0;
2181 }
2182 }
2183 for (i = 0; i < p_node_info->output_size; i++)
2184 if (p_node_info->outputs[i] >= 0)
2185 {
2186 const int d = p_node_info->outputs[i];
2187 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2188 {
2189 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2190 if (dd >= 0) // If this exists in this sub-graph, great.
2191 tensor_blocks[dd].flags = 0;
2192 }
2193 }
2194 }
2195 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2196 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2197 {
2198 // Check no tensor info is auto now.
2199 assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2199, __extension__ __PRETTY_FUNCTION__
); }))
;
2200 // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
2201 // therefore, its own life-cycle almost certainly won't concatenate properly with the tensor to
2202 // fold to).
2203 if (tensor_symbol_info[i].assign_ref)
2204 {
2205 // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2206 // It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2207 // it kept its own representation, which is not the case for output).
2208 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2209 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2210 // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2211 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2212 // It also cannot be folded as output (except i), because we need to keep its own representation.
2213 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2214 assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
== 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2214, __extension__ __PRETTY_FUNCTION__
); }))
;
2215 tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2216 for (j = 0; j < unroll_count; j++)
2217 {
2218 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT))
;
2219 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2220 }
2221 if (tensor_blocks[assign_ref].bypass_ref)
2222 {
2223 // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2224 tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2225 const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2226 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2227 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2228 // On the other hand, it can be folded into the except_ref for the bypass_ref.
2229 tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2230 if (dup_tensor_from_ref)
2231 {
2232 const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2233 if (bypass_from_ref >= 0)
2234 {
2235 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT))
;
2236 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT))
;
2237 assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
+ unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
- 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2237, __extension__ __PRETTY_FUNCTION__
); }))
;
2238 for (j = 0; j < unroll_count - 1; j++)
2239 {
2240 // Mark every incarnation as unfold-able.
2241 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_INPUT))
;
2242 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT))
;
2243 }
2244 }
2245 }
2246 }
2247 }
2248 }
2249 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2250 {
2251 // If it has a pair reference, we don't need to allocate this tensor at all,
2252 // set it to be unassigned.
2253 if (tensor_symbol_info[i].pair_ref)
2254 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED))
;
2255 // If it is a tape variable, set it to be un-foldable as well (otherwise we cannot use tape properly).
2256 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2257 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2258 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2259 // For this case, there is no exception.
2260 tensor_blocks[i].unfoldable_except_ref = 0;
2261 } else if (tensor_symbol_info[i].p_ref) {
2262 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2262, __extension__ __PRETTY_FUNCTION__); }))
;
2263 const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2264 // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2265 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2266 // TODO: This check can be lifted if we can fold in the parent graph.
2267 if (-1 == p_ref_is_in_or_out)
2268 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2269 if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2270 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2271 }
2272 }
2273 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2274 {
2275 if (tensor_symbol_info[i].alias_ref)
2276 {
2277 const int ref = tensor_symbol_info[i].alias_ref - 1;
2278 // If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2279 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2280 tensor_blocks[ref].flags = 0;
2281 // An alias cannot ref to another alias.
2282 assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
__assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2282, __extension__ __PRETTY_FUNCTION__); }))
;
2283 tensor_blocks[i].flags = ALIAS;
2284 tensor_blocks[i].ref = ref + 1; // Assign the ref.
2285 if (!tensor_blocks[ref].r_refs)
2286 tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2287 ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2288 }
2289 }
2290 // Scan again and if the ref is not assigned, mark the alias not assigned.
2291 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2292 if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2293 {
2294 const int ref = tensor_blocks[i].ref - 1;
2295 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2296 {
2297 // Mark this as unassigned.
2298 tensor_blocks[i].flags = UNASSIGNED;
2299 tensor_blocks[i].ref = 0;
2300 }
2301 }
2302 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2303 {
2304 // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2305 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
2306 {
2307 tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2308 tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2309 // Cache tensor size (align to 16 bytes).
2310 tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2311 }
2312 // If there is a p_ref, add the one to the p_refs list.
2313 if (tensor_symbol_info[i].p_ref)
2314 tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2315 }
2316 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2317 for (i = 0; i < node->input_size; i++)
2318 {
2319 int d = node->inputs[i];
2320 if (d < 0)
2321 continue;
2322 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2323 d = tensor_symbol_info[d].alias_ref - 1;
2324 tensor_blocks[d].flags |= READ_ONLY;
2325 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2326 continue;
2327 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2327, __extension__ __PRETTY_FUNCTION__
); }))
;
2328 /* If this is first encounter, its head starts (this tensor is init'ed outside of the graph)
2329 * from the very beginning of the graph life-cycle and ends here. */
2330 if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
2331 {
2332 for (j = 0; j < source_size; j++)
2333 {
2334 // If the source is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2335 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2336 if (cell.i32 && cell.i32[0] > 0)
2337 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2338 }
2339 /* If this is a read-only (based on SSA, if first encountered as read), and this is
2340 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2341 * loop, however, in that case, you need to prevent read-only gets reused for the
2342 * output tensor, which is not obvious how to implement correctly), and it is not
2343 * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2344 * of memory anyway (because on second loop, we want to read the same value out).
2345 * Mark it to the end of the graph. */
2346 if (p_node_info && !tensor_symbol_info[d].assign_ref)
2347 for (j = 0; j < destination_size; j++)
2348 {
2349 // If the destination is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2350 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2351 if (cell.i32 && cell.i32[0] > 0)
2352 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2353 }
2354 }
2355 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2356 }
2357 for (i = 0; i < node->output_size; i++)
2358 {
2359 int d = node->outputs[i];
2360 if (d < 0)
2361 continue;
2362 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2363 d = tensor_symbol_info[d].alias_ref - 1;
2364 tensor_blocks[d].flags |= WRITE_ONLY;
2365 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2366 continue;
2367 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2367, __extension__ __PRETTY_FUNCTION__
); }))
;
2368 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2369 }
2370 } ccv_nnc_graph_visit_endfor} }
2371 // For any assign_ref, its life-time kept until the end and wrap over.
2372 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2373 // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2374 // that "somewhere else" need to keep its life-time til the end.
2375 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&&
2376 p_node_info && tensor_symbol_info[i].assign_ref)
2377 {
2378 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2379 for (j = 0; j < destination_size; j++)
2380 {
2381 // This logic is to be more conservative about which destination we add to.
2382 // As of now, if we add everything, it is fine most likely. However, it may
2383 // cause issues in the future to do so naively. Thus, instead, we only add
2384 // the destination to it iff either the tensor is not used at all, or, the
2385 // destination is on the same stream as of the tensor block some way.
2386 int flag = !tensor_blocks[assign_ref].tail;
2387 for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2388 {
2389 const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
+ (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)))
;
2390 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2391 flag = (cell.i32 && cell.i32[0] > 0);
2392 }
2393 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2394 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2395 }
2396 }
2397 for (i = 0; i < output_size; i++)
2398 {
2399 assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
__assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2399, __extension__ __PRETTY_FUNCTION__); }))
;
2400 int d = outputs[i].d;
2401 if (d < 0)
2402 continue;
2403 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2404 d = tensor_symbol_info[d].alias_ref - 1;
2405 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2406 continue;
2407 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2407, __extension__ __PRETTY_FUNCTION__
); }))
;
2408 for (j = 0; j < destination_size; j++)
2409 {
2410 int flag = !tensor_blocks[d].tail;
2411 for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2412 {
2413 const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
2414 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2415 flag = (cell.i32 && cell.i32[0] > 0);
2416 }
2417 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2418 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2419 }
2420 }
2421 // Enforce tensor reuse by collapsing tensors for in-place operations. We will fault if this cannot be done.
2422 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2423 int x, y;
2424 for (x = 0; x < node->input_size; x++)
2425 for (y = 0; y < node->output_size; y++)
2426 /* Some operations enforces some tensors to be the same for inputs / outputs. */
2427 if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2428 {
2429 // If both unassigned, it is fine.
2430 if (node->inputs[x] < 0 && node->outputs[y] < 0)
2431 continue;
2432 int ref = node->inputs[x];
2433 assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2433, __extension__ __PRETTY_FUNCTION__); }))
;
2434 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2435 ref = tensor_blocks[ref].ref - 1;
2436 const int node_output_y = node->outputs[y];
2437 assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2437, __extension__ __PRETTY_FUNCTION__
); }))
;
2438 // If both are not computable, it is fine, we don't need to enforce.
2439 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2440 !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
!((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
))
)
2441 continue;
2442 // Otherwise, enforce and error out if failed.
2443 if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2444 { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2444, __extension__ __PRETTY_FUNCTION__
); }))
; }
2445 }
2446 } ccv_nnc_graph_visit_endfor} }
2447 // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2448 // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2449 // that is not enforced in-place (because the tensor enforced in-place will be different than the
2450 // binding one).
2451 for (i = 0; i < tensor_bind_size; i++)
2452 {
2453 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2454 // If there is a tensor bound, then it is unassigned.
2455 if (resolved_symbol.d >= 0)
2456 {
2457 int d = resolved_symbol.d;
2458 // I cannot assert too much at this moment.
2459 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2460 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2461 // This check is for in-place ops. Only in-place op could have unassigned but ref.
2462 // It has nothing to do with alias.
2463 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2464 d = tensor_blocks[d].ref - 1;
2465 // Doesn't work if this is a loop carrying variable.
2466 assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
__extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
__assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2466, __extension__ __PRETTY_FUNCTION__); }))
;
2467 tensor_blocks[d].flags = UNASSIGNED;
2468 tensor_blocks[d].ref = 0; // No need to have ref as well.
2469 }
2470 }
2471 // Maximize tensor reuse by collapsing tensors where in-place operations are allowed (and it matches the start, end tensor).
2472 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2473 int x, y;
2474 for (x = 0; x < node->input_size; x++)
2475 {
2476 /* If the input is not assigned, it can be referenced, find the referenced one */
2477 int ref = node->inputs[x];
2478 if (ref < 0)
2479 continue;
2480 const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2481 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2482 ref = tensor_blocks[ref].ref - 1;
2483 assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2483, __extension__ __PRETTY_FUNCTION__
); }))
;
2484 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2485 tensor_blocks[ref].tail->rnum == 1)
2486 {
2487 for (y = 0; y < node->output_size; y++)
2488 /* Only proceed if the input symbol is different from the output symbol, */
2489 /* and the input symbol meets the output symbol exactly at the same spot. */
2490 if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2491 node->outputs[y] >= 0 &&
2492 ref != node->outputs[y] &&
2493 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
0x3) == UNASSIGNED))
)
2494 {
2495 const int node_output_y = node->outputs[y];
2496 const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2497 /* If dimension matches perfectly, then we can assign y_symbol to x.
2498 * If both of them are aliases, making sure their origin matches in size too. */
2499 if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2500 {
2501 _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2502 // This refers to an alias itself, now mark it and will be processed later.
2503 if (ref != node->inputs[x])
2504 tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2505 }
2506 }
2507 }
2508 }
2509 } ccv_nnc_graph_visit_endfor} }
2510 // Specifically handle the bypass. This need to be done after the first pass.
2511 // I need to extend the bypass life-time to the same as the one I am going with.
2512 // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2513 ccv_nnc_tensor_block_t empty_block = {};
2514 empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2515 empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2516 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2517 if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2518 {
2519 int can_bypass = 1;
2520 for (i = 0; can_bypass && i < node->output_size; i++)
2521 {
2522 int d = node->outputs[i];
2523 if (d < 0)
2524 continue;
2525 if (!tensor_blocks[d].bypass_ref)
2526 continue;
2527 while (tensor_blocks[d].ref)
2528 d = tensor_blocks[d].ref - 1;
2529 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2530 while (tensor_blocks[bypass_ref].ref)
2531 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2532 // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2533 if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2534 continue;
2535 ccv_array_clear(empty_block.head);
2536 for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2537 ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
+ (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j)))
);
2538 ccv_array_clear(empty_block.tail);
2539 for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2540 ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
+ (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j)))
);
2541 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2542 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, empty_block);
2543 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2544 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, empty_block);
2545 // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2546 assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2546, __extension__ __PRETTY_FUNCTION__
); }))
;
2547 int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2548 while (tensor_blocks[b_ref].ref)
2549 b_ref = tensor_blocks[b_ref].ref - 1;
2550 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2551 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2552 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2553 // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2554 can_bypass = can_bypass && (a_hop_b || b_hop_a);
2555 }
2556 if (can_bypass)
2557 {
2558 for (i = 0; i < node->output_size; i++)
2559 {
2560 int d = node->outputs[i];
2561 if (d < 0)
2562 continue;
2563 if (!tensor_blocks[d].bypass_ref)
2564 continue;
2565 while (tensor_blocks[d].ref)
2566 d = tensor_blocks[d].ref - 1;
2567 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2568 while (tensor_blocks[bypass_ref].ref)
2569 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2570 // The bypass_ref can extend its life-time.
2571 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2572 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2573 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2574 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2575 }
2576 } else {
2577 for (i = 0; i < node->output_size; i++)
2578 tensor_blocks[node->outputs[i]].bypass_ref = 0;
2579 const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2580 // Mark this exec as no bypass IO (thus, I need to insert explicit data transfer).
2581 exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2582 }
2583 }
2584 } ccv_nnc_graph_visit_endfor} }
2585 ccv_array_free(empty_block.head);
2586 ccv_array_free(empty_block.tail);
2587 *r_exec_dep = exec_dep;
2588 *r_tensor_blocks = tensor_blocks;
2589}
2590
// Command substitution callback (handed to ccv_nnc_symbolic_graph_dup when the graph is duplicated for unrolling).
// If cmd is a sub-graph command (CCV_NNC_GRAPH_FORWARD or CCV_NNC_GRAPH_BACKWARD), return a copy of it
// whose cmd field is replaced with CCV_NNC_NOOP; every other command is returned unchanged.
// The symbol parameter is not referenced in the body.
2591static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2592{
2593 if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2594 {
// Keep every other field of the command as-is, only the command identifier changes.
2595 ccv_nnc_cmd_t retval = cmd;
2596 retval.cmd = CCV_NNC_NOOP;
2597 return retval;
2598 }
2599 return cmd;
2600}
2601
// Return the tensor symbol in dup_graph that stands for the original tensor symbol `input`, creating it on first use.
// dup_tensor_block_ref is read and written at [symbol index * unroll_count]; the callers visible in this file pass a
// pointer that is already advanced to the current unroll slot. A negative entry means "no duplicate yet".
// When a symbol is created here, its pair_ref (paired against dup_graph->pair), its flags and its bypass_ref are carried over.
// The returned symbol always has .graph == dup_graph and .d == the (possibly pre-existing) table entry.
2602static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2603{
2604 if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2605 {
2606 if (tensor_symbol_info[input].alias_ref)
2607 {
// alias_ref is stored with a +1 offset (0 means "not an alias"). The alias target must not be an alias itself.
2608 const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2609 assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2609, __extension__ __PRETTY_FUNCTION__
); }))
;
2610 ccv_nnc_tensor_symbol_t tensor_symbol = {};
// Make sure the tensor this alias points to has a duplicate first: create it, or reuse the one already recorded.
2611 if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2612 {
2613 tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2614 if (tensor_symbol_info[alias_ref].pair_ref)
2615 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2616 .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2617 .graph = dup_graph->pair
2618 });
2619 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2620 dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2621 } else {
2622 tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2623 tensor_symbol.graph = dup_graph;
2624 }
// Now create the alias on top of the duplicated target, with the same offset, stride and tensor info as the original alias.
2625 ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2626 if (tensor_symbol_info[input].pair_ref)
2627 ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2628 .d = tensor_symbol_info[input].pair_ref - 1,
2629 .graph = dup_graph->pair
2630 });
2631 ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2632 dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2633 } else {
// Not an alias: a plain new tensor symbol with the same tensor info.
2634 ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2635 if (tensor_symbol_info[input].pair_ref)
2636 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2637 .d = tensor_symbol_info[input].pair_ref - 1,
2638 .graph = dup_graph->pair
2639 });
2640 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2641 dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2642 }
2643 if (tensor_symbol_info[input].bypass_ref)
2644 {
// The bypass target must already have a duplicate (asserted below); point the new symbol's bypass_ref at it (+1 offset).
2645 const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2646 assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2646, __extension__ __PRETTY_FUNCTION__
); }))
;
2647 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])))
;
2648 symbol_info->bypass_ref = dup_bypass_ref + 1;
2649 }
2650 }
2651 return (ccv_nnc_tensor_symbol_t) {
2652 .d = dup_tensor_block_ref[input * unroll_count],
2653 .graph = dup_graph,
2654 };
2655}
2656
// Return the exec symbol in dup_graph that stands for the original exec node `node` (index idx), creating it on first use.
// dup_exec_ref is read and written at [idx * unroll_count]; a negative entry means "no duplicate yet".
// When the duplicate is created, its inputs and outputs are resolved through _ccv_nnc_dup_tensor_symbol and it gets node->cmd.
// max_inputs / max_outputs are caller-provided scratch arrays; entries [0, node->input_size) and [0, node->output_size)
// are overwritten, so they must be at least that large.
2657static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2658{
2659 int i;
2660 if (dup_exec_ref[idx * unroll_count] < 0)
2661 {
2662 // Input has to come before output, because output could have a bypass reference to the input.
// Negative tensor indices are not duplicated: they are passed through as-is, only re-homed to dup_graph.
2663 for (i = 0; i < node->input_size; i++)
2664 max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2665 for (i = 0; i < node->output_size; i++)
2666 max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2667 ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2668 dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2669 }
2670 return (ccv_nnc_graph_exec_symbol_t) {
2671 .d = dup_exec_ref[idx * unroll_count],
2672 .graph = dup_graph,
2673 };
2674}
2675
// Release an array of tensor blocks: for each of the first tensor_block_size entries, free the head, tail,
// r_refs and dup_p_refs arrays that are non-NULL, then free the tensor_blocks array itself.
2676static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2677{
2678 int i;
2679 for (i = 0; i < tensor_block_size; i++)
2680 {
// Each of these arrays is optional, hence the NULL checks before ccv_array_free.
2681 if (tensor_blocks[i].head)
2682 ccv_array_free(tensor_blocks[i].head);
2683 if (tensor_blocks[i].tail)
2684 ccv_array_free(tensor_blocks[i].tail);
2685 if (tensor_blocks[i].r_refs)
2686 ccv_array_free(tensor_blocks[i].r_refs);
2687 if (tensor_blocks[i].dup_p_refs)
2688 ccv_array_free(tensor_blocks[i].dup_p_refs);
2689 }
2690 ccfreefree(tensor_blocks);
2691}
2692
2693// Find tensors that cannot be solved by co-allocating to the same location.
// For every tensor symbol that is not UNASSIGNED and has an assign_ref, resolve both it and its assign_ref target
// through the tensor block .ref chains. If the two resolved blocks differ and their life-times do not interfere
// (per _ccv_nnc_tensor_block_head_after_tail, checked in both directions), they are marked as companions of each other.
// Otherwise the assign_ref chain starting at the target is walked to decide how many unrolls are needed.
// Returns the largest unroll count found (0 when nothing needs unrolling). When the result is non-zero,
// every companion_ref written (or previously present) in tensor_blocks is reset to 0 before returning.
2694static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2695{
2696 int i, j, unroll_count = 0;
2697 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2698 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2699 {
2700 // This is a parameter, thus, it has to be either an alias or used.
2701 assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
& 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2701, __extension__ __PRETTY_FUNCTION__
); }))
;
2702 const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2703 // The parameter it assign to has to be either an alias or used.
2704 assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2704, __extension__ __PRETTY_FUNCTION__
); }))
;
2705 // If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2706 // If it is the same, we are good, no need to extend.
// .ref is stored with a +1 offset; follow it until reaching a block that has no further ref.
2707 int a_ref = i;
2708 while (tensor_blocks[a_ref].ref)
2709 a_ref = tensor_blocks[a_ref].ref - 1;
2710 int b_ref = assign_ref;
2711 while (tensor_blocks[b_ref].ref)
2712 b_ref = tensor_blocks[b_ref].ref - 1;
2713 if (a_ref != b_ref)
2714 {
2715 // If any of the b's head is deterministically later than a's tail
2716 // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2717 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2718 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2719 // It cannot be that both a can hop to b and b can hop to a.
2720 assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
> 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2720, __extension__ __PRETTY_FUNCTION__
); }))
;
2721 // Can it be folded
2722 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2723 if (a_hop_b || b_hop_a)
2724 {
2725 tensor_blocks[a_ref].companion_ref = b_ref + 1;
2726 tensor_blocks[b_ref].companion_ref = a_ref + 1;
2727 continue;
2728 }
// Life-times interfere. Count the hops (j) along the assign_ref chain starting from b_ref, resolving .ref at each step,
// until a symbol without assign_ref is reached; j + 1 is the unroll count this pair asks for.
2729 int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2730 for (j = 0; c_ref >= 0; j++)
2731 {
2732 while (tensor_blocks[c_ref].ref)
2733 c_ref = tensor_blocks[c_ref].ref - 1;
2734 c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2735 }
2736 unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
= (j + 1); (_a > _b) ? _a : _b; })
;
2737 }
2738 }
2739 // Reset companion_ref if need to unroll.
2740 if (unroll_count)
2741 for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2742 tensor_blocks[j].companion_ref = 0;
2743 return unroll_count;
2744}
2745
// Duplicate the visited exec nodes of symbolic_graph (and the tensor symbols they touch) into dup_graph, unroll_count times.
// r_dup_exec_ref and r_dup_tensor_block_ref are filled as tables indexed by [original index * unroll_count + n]:
// they give, for the n-th unroll, the index in dup_graph that stands for the original exec / tensor symbol.
// A tensor entry may point at an already existing symbol when no duplicate is needed (assign_ref or read-only, see below).
// After each unroll, the destinations of dup_graph are replaced by that unroll's duplicated nodes that have no outgoing edge.
// exec_dep is not referenced in this function.
2746static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2747{
2748 int i, j, n;
2749 // The inout exec nodes, these are the nodes we are going to extend.
// One zero-initialized flag byte per original exec symbol; the bits are INCOMING_NODE / OUTGOING_NODE defined below.
2750 uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2751 int max_input_size = 0;
2752 int max_output_size = 0;
2753 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2754 {
2755 max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
;
2756 max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
;
2757 }
// Scratch arrays reused by every _ccv_nnc_dup_graph_exec_symbol call below, sized for the widest node (never less than 1 element).
2758 ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
2759 ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
2760 // Doing graph expansion
2761 // It goes without saying, we must have at least one tensor / exec (otherwise I cannot use 0 as no exec ref).
2762 assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2762, __extension__ __PRETTY_FUNCTION__
); }))
;
2763 assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2763, __extension__ __PRETTY_FUNCTION__
); }))
;
2764#define INCOMING_NODE (1)
2765#define OUTGOING_NODE (2)
2766 // Unroll the graph n times.
2767 for (n = 0; n < unroll_count; n++)
2768 {
// Advance the tables to slot n, so that [index * unroll_count] in the helpers addresses [index * unroll_count + n].
// prev_dup_tensor_block_ref is the slot of the previous unroll, or NULL (0) for the first unroll.
2769 int* const dup_exec_ref = r_dup_exec_ref + n;
2770 const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2771 int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2772 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2773 dup_exec_ref[i * unroll_count] = -1;
2774 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2775 {
2776 // If there is an assign_ref, that means I don't need to dup the tensor.
// Instead it maps to the previous unroll's entry for the assign_ref target (or to the target itself on the first unroll).
2777 if (tensor_symbol_info[i].assign_ref)
2778 {
2779 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2780 dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2781 } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
 && TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2782 // If this is a read-only tensor block, no need to duplicate because the value never changes
2783 // (note we handled assign_ref first), therefore, no need to generate duplicate.
2784 dup_tensor_block_ref[i * unroll_count] = i;
2785 else
// -1 marks "not duplicated yet"; _ccv_nnc_dup_tensor_symbol creates the symbol on first use.
2786 dup_tensor_block_ref[i * unroll_count] = -1;
2787 }
2788 // Go through the original graph, make copies of the node if it is inout.
2789 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
 {
2790 ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2791 inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2792 if (!node->outgoings)
2793 continue;
// Replicate each outgoing edge of the original node between the corresponding duplicates.
2794 for (i = 0; i < node->outgoings->rnum; i++)
2795 {
2796 const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2797 inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2798 ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2799 ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2800 }
2801 } ccv_nnc_graph_visit_endfor} }
2802 // Check that the visited nodes are all marked as either incoming or outgoing.
2803 const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2804 const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2805 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2806 {
2807 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2808 continue;
2809 assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2809, __extension__ __PRETTY_FUNCTION__
); }))
;
2810 // If this is a pure incoming node, then I need to concat every original destination node to this one.
2811 if (inout[i] == INCOMING_NODE)
2812 for (j = 0; j < dup_destination_size; j++)
2813 {
2814 ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2815 .d = dup_destinations[j].d,
2816 .graph = dup_graph,
2817 }, (ccv_nnc_graph_exec_symbol_t) {
2818 .d = dup_exec_ref[i * unroll_count],
2819 .graph = dup_graph,
2820 });
2821 }
2822 }
// Rebuild the destination list of dup_graph from this unroll's duplicates.
2823 if (dup_graph->destinations)
2824 ccv_array_clear(dup_graph->destinations);
2825 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2826 {
2827 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2828 continue;
2829 const int d = dup_exec_ref[i * unroll_count];
// Note: this local shadows the exec_symbol_info parameter from here to the end of the loop body;
// it points at the duplicate's entry inside dup_graph, not at the original graph's array.
2830 ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
 + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)))
;
2831 // If this has no outgoing node, add to the destination.
2832 if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2833 ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2834 .graph = dup_graph,
2835 .d = d,
2836 });
2837 }
2838 }
2839#undef INCOMING_NODE
2840#undef OUTGOING_NODE
2841 ccfreefree(inout);
2842}
2843
// After unrolling, retarget every assign_ref in dup_tensor_symbol_info (for the symbols of the original graph) to the
// duplicate produced by the last unroll (slot unroll_count - 1 of dup_tensor_block_ref), and keep r_assign_ref in sync:
// the old target's r_assign_ref is cleared and the new target's r_assign_ref is set to point back at the symbol.
// Both assign_ref and r_assign_ref are stored with a +1 offset (0 means none).
// tensor_blocks is not referenced in the body.
2844static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2845{
2846 int i;
2847 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2848 // Now can assign them (The dup) as companion.
2849 // Get to the last one, which we will wrap over.
2850 if (dup_tensor_symbol_info[i].assign_ref)
2851 {
// Detach the back-reference from the old target before overwriting assign_ref.
2852 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2853 dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
// A table entry of -1 would yield 0 here, i.e. the old target was never duplicated in the last unroll.
2854 assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
 ; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2854, __extension__ __PRETTY_FUNCTION__
); }))
;
2855 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2856 }
2857}
2858
2859// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2860// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2861// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
// For each output of p_node_info, the matching tensor block of this graph is found through the s_ref array of
// p_tensor_symbol_info[d] at position p_idx (the stored value minus 1 is the tensor block index).
// Every destination exec is then added to that block via _ccv_nnc_tensor_block_add_exec, and, for each unroll j,
// every duplicated destination is added to the duplicated block when both duplicates exist (index >= 0).
// p_tensor_symbol_info_size and tensor_symbol_info are not referenced in the body.
2862static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2863{
2864 int i, j, k;
2865 for (i = 0; i < p_node_info->output_size; i++)
2866 {
// NOTE(review): d is used as an index without a d >= 0 check, and s_ref is dereferenced without a NULL check
// - presumably the caller guarantees both; confirm.
2867 const int d = p_node_info->outputs[i];
2868 const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
 (size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx)))
 - 1;
// Alias and unassigned blocks are skipped.
2869 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
 !((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED))
)
2870 continue;
2871 for (k = 0; k < destination_size; k++)
2872 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2873 // Add the duplicated destinations to the tensor_block_ref.
2874 for (j = 0; j < unroll_count; j++)
2875 for (k = 0; k < destination_size; k++)
2876 {
2877 const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2878 const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2879 if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2880 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2881 }
2882 }
2883}
2884
2885static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2886{
2887 int i, j;
2888 ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2889 ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2890 // blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2891 // Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2892 // No need to change anything, we are good.
2893 const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2894 if (!unroll_count)
2895 return;
2896 // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2897 // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2898 ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2899 int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2900 int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2901 _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2902 ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2903 ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2904 ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
(dup_graph->exec_symbol_info->rnum) + _incoming_edges_ >
1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
= (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = {
(dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1
; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for
(_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
_idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue
; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r =
1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) {
((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph
->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1
; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_
[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((
_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum
)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->destinations->rnum); _exist_size_[1] =
0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
= 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
6 && _d_ < (dup_graph->destinations->rnum))
{ _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
< ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 6 && _d_ <
(dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0
), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->
rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
<= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2904, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
2905 ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2906 _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2907 // Free out the old exec_dep
2908 ccv_matrix_free(exec_dep);
2909 // and the tensor blocks, prepare for the new.
2910 _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2911 // A reverse map to find where the original tensor comes from.
2912 int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2913 for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2914 dup_tensor_from_ref[i] = -1;
2915 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2916 for (j = 0; j < unroll_count; j++)
2917 if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2918 dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2919 int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2920 for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2921 dup_exec_from_ref[i] = -1;
2922 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2923 {
2924 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2925 continue;
2926 dup_exec_from_ref[i] = i; // Reference back.
2927 for (j = 0; j < unroll_count; j++)
2928 if (dup_exec_ref[i * unroll_count + j] >= 0)
2929 dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2930 }
2931 // Reset all attr.
2932 memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2933 _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2934 ccv_nnc_graph_visit_free(dup_visit);
2935 ccfreefree(dup_exec_symbol_info);
2936 ccfreefree(dup_exec_from_ref);
2937 ccfreefree(dup_tensor_from_ref);
2938 // Assign out dup_p_ref, which will be used to extended the anonymous block life-time.
2939 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2940 // Loop over all possible duplications to assign dup_p_ref properly.
2941 for (j = 0; j < unroll_count; j++)
2942 {
2943 const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2944 if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2945 {
2946 const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2947 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2948 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2949 {
2950 if (!tensor_blocks[dup_idx].dup_p_refs)
2951 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2952 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2953 }
2954 if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2955 continue;
2956 const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2957 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2958 if (p_ref_1_is_in_or_out == 1)
2959 {
2960 if (!tensor_blocks[dup_idx].dup_p_refs)
2961 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2962 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2963 }
2964 }
2965 }
2966 // companion_ref
2967 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2968 // Now can assign them (The dup) as companion.
2969 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2970 {
2971 // Get to the last one, which we will wrap over.
2972 const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2973 if (assign_ref >= 0)
2974 {
2975 int b_ref = assign_ref;
2976 while (tensor_blocks[b_ref].ref)
2977 b_ref = tensor_blocks[b_ref].ref - 1;
2978 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2979 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2980 // It cannot be that both i can hop to j can j can hop to i.
2981 // And it can be hop from one to another now after duplication.
2982 assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2982, __extension__ __PRETTY_FUNCTION__); }))
;
2983 tensor_blocks[i].companion_ref = b_ref + 1;
2984 tensor_blocks[b_ref].companion_ref = i + 1;
2985 }
2986 }
2987 ccfreefree(dup_tensor_symbol_info);
2988 // Extend the dup tensor block ref, prepare for future extensions.
2989 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2990 for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2991 dup_tensor_block_ref[i] = -1;
2992 // Assign out changed properties.
2993 *r_exec_dep = exec_dep;
2994 *r_tensor_blocks = tensor_blocks;
2995 *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2996 *r_dup_graph = dup_graph;
2997 *r_unroll_count = unroll_count;
2998 *r_dup_exec_ref = dup_exec_ref;
2999 *r_dup_tensor_block_ref = dup_tensor_block_ref;
3000}
3001
// Pick an index into tensor_blocks from the anonymous block free list for a request of the given `type` and `size`.
// Only the first anonymous_block_free_list_cap entries of the free list are examined; each entry is read as an int
// index and asserted to be less than tensor_block_size. Entries whose block type differs from `type` are skipped.
// Returns tensor_block_size when the free list is null, the cap is 0, or no entry with a matching type was seen.
// dup_p_refs may be null or empty; in that case the first type-matching block with size >= `size` is returned directly.
// exec_dep is only passed through to _ccv_nnc_tensor_block_a_after_b_inclusively for the dup_p_refs comparisons.
3002static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
3003{
3004 if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
3005 return tensor_block_size;
3006 int i;
3007 const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
 // tensor_block_size doubles as the "nothing found yet" sentinel for found_idx.
3008 int found_idx = tensor_block_size;
3009 for (i = 0; i < anonymous_block_free_list_cap; i++)
3010 {
3011 const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)))
;
3012 assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 3012, __extension__ __PRETTY_FUNCTION__
); }))
;
3013 // If the type doesn't match, ignore.
3014 if (tensor_blocks[idx].type != type)
3015 continue;
3016 // Heuristic about how to select the best tensor block to move forward.
3017 // If the size is larger, and no dup_p_refs, found, I cannot do better than this, just return directly.
3018 if (tensor_blocks[idx].size >= size)
3019 {
3020 if (no_dup_p_refs)
3021 return idx;
3022 // Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
3023 // then we cannot do better than this, if that is the case, just return.
3024 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
3025 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
3026 return idx;
3027 }
 // The two diffs below are assigned only inside the right-hand operand of the || that follows. When
 // found_idx == tensor_block_size the || short-circuits and that branch continues before either is read,
 // so every later read happens after both have been assigned.
3028 int64_t found_idx_size_diff;
3029 int64_t idx_size_diff;
 // NOTE(review): as written, idx replaces found_idx when found_idx's size difference is SMALLER than idx's,
 // and found_idx is kept when its difference is LARGER. That appears to contradict the "closer to the size
 // we request" comment below - confirm the intended direction of these two comparisons.
3030 if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
3031 // Now, compare whether this one or the found_idx one is better.
3032 // At this point, there is no point of comparing the dup_p_refs, we only care about which one
3033 // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
3034 (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3035 {
3036 found_idx = idx;
3037 continue;
3038 }
3039 // No need to update if found_idx is better than idx.
3040 if (found_idx_size_diff > idx_size_diff)
3041 continue;
3042 // We bias towards the bigger one in case of similar.
3043 if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3044 {
3045 found_idx = idx;
3046 continue;
3047 }
 // Reaching here means the two diffs are equal and idx's size is not greater than found_idx's.
 // NOTE(review): if idx is smaller than `size` and found_idx is larger by the same amount, the sizes differ
 // and this assert would fire - confirm that combination cannot occur.
3048 assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
== tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 3048, __extension__ __PRETTY_FUNCTION__
); }))
;
3049 // On a tie, check which one has tighter life-cycle.
3050 if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3051 {
3052 // Check whether the current tensor blocks life-cycle is longer than the previous one.
3053 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3054 (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3055 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3056 found_idx = idx;
3057 continue;
3058 }
3059 // Now both our size is smaller than requested size, in this case, we need to increase the tensor block size.
3060 // We prefer to choose the one that has life-cycle closer to the expected ones.
3061 if (no_dup_p_refs)
3062 {
3063 // Whoever is shorter wins.
3064 if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3065 (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3066 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3067 found_idx = idx;
3068 continue;
3069 }
 // From here on the request has dup_p_refs: a candidate without dup_p_refs never replaces found_idx,
 // and a candidate with dup_p_refs always replaces a found_idx that has none.
3070 if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3071 continue;
3072 if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3073 {
3074 found_idx = idx;
3075 continue;
3076 }
3077 // If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3078 const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3079 const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3080 if (idx_after_request && found_idx_after_request)
3081 {
3082 if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3083 found_idx = idx;
3084 continue;
3085 } else {
3086 // We entered this branch must be either idx_after_request is false or found_idx_after_request is false or both.
3087 // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3088 // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3089 if (!found_idx_after_request && (idx_after_request ||
3090 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3091 found_idx = idx;
3092 continue;
3093 }
3094 }
 // Still tensor_block_size here if no type-matching entry was ever recorded.
3095 return found_idx;
3096}
3097
// For a graph whose parent node is a while-loop node (p_node_info has CCV_NNC_GRAPH_EXEC_P_WHILE set), create one
// NOOP exec symbol per breakpoint that takes the parent's non-negative p_while.inputs as tensor inputs, and wire it
// into symbolic_graph alongside the breakpoint it duplicates:
//  - the noop is connected to every outgoing node of the breakpoint,
//  - every live exec that has the breakpoint as an outgoing node is connected to the noop,
//  - the noop is added as a source / destination when the breakpoint is one.
// Returns 0 when p_node_info is null, is not a while node, or has no non-negative inputs. Otherwise returns the
// array of created noop symbols (one per breakpoint, in breakpoint order); it is not freed in this function.
3098static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3099{
3100 if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3101 return 0;
3102 int i, j, k;
 // First pass only counts the non-negative inputs so the array below can be sized.
3103 int input_size = 0;
3104 for (i = 0; i < p_node_info->p_while.input_size; i++)
3105 if (p_node_info->p_while.inputs[i] >= 0)
3106 ++input_size;
3107 // If doesn't have tensor inputs (thus, only special inputs), just return.
3108 if (!input_size)
3109 return 0;
 // Variable-length array sized by the count above; input_size is reset and recounted while filling it.
3110 ccv_nnc_tensor_symbol_t inputs[input_size];
3111 input_size = 0;
3112 for (i = 0; i < p_node_info->p_while.input_size; i++)
3113 if (p_node_info->p_while.inputs[i] >= 0)
3114 inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3115 .d = p_node_info->p_while.inputs[i],
3116 .graph = symbolic_graph,
3117 };
3118 assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
> 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3118, __extension__ __PRETTY_FUNCTION__
); }))
;
3119 ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
 // Captured before any noop is created, so the later scan over execs is bounded by the count at this point
 // (presumably ccv_nnc_graph_exec_symbol_new grows exec_symbol_info - confirm).
3120 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3121 for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3122 {
3123 // Make a noop copy of the breakpoint, but with some tensor inputs.
3124 ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
3125 ccv_array_push(dup_breakpoints, &noop);
3126 // Connect this noop to the outgoing nodes of breakpoints.
3127 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(symbolic_graph->breakpoints[i].d)))
;
3128 if (symbol_info->outgoings)
3129 for (j = 0; j < symbol_info->outgoings->rnum; j++)
3130 {
3131 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3132 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3133 .d = d,
3134 .graph = symbolic_graph,
3135 });
3136 }
3137 }
 // Connect every live exec that points at a breakpoint to the matching noop as well.
3138 for (i = 0; i < exec_symbol_info_size; i++)
3139 {
3140 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(i)))
;
3141 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
3142 continue;
3143 if (symbol_info->outgoings)
3144 {
 // The outgoing count is read once up front (presumably because the concat below appends to this same
 // outgoings array - confirm), so only the edges present at this point are scanned.
3145 const int outgoing_size = symbol_info->outgoings->rnum;
3146 for (j = 0; j < outgoing_size; j++)
3147 {
3148 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3149 for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3150 if (d == symbolic_graph->breakpoints[k].d)
3151 {
3152 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(k)))
;
3153 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3154 .d = i,
3155 .graph = symbolic_graph,
3156 }, noop);
3157 // Found, connected, exit.
3158 break;
3159 }
3160 }
3161 }
3162 }
3163 // Add the dup_breakpoints to source if necessary.
3164 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 3164, __extension__ __PRETTY_FUNCTION__
); }))
;
 // Count is read before the loop so that sources added inside the loop are not themselves visited.
3165 const int source_size = symbolic_graph->sources->rnum;
3166 for (i = 0; i < source_size; i++)
3167 {
3168 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(i)))
)->d;
3169 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3170 if (d == symbolic_graph->breakpoints[j].d)
3171 {
3172 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)))
;
3173 ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3174 // Found, made, exit.
3175 break;
3176 }
3177 }
3178 // Add the dup_breakpoints to destination if necessary.
3179 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 3179, __extension__ __PRETTY_FUNCTION__); }))
;
 // Same pattern as for sources: the count is fixed before destinations are added in the loop.
3180 const int destination_size = symbolic_graph->destinations->rnum;
3181 for (i = 0; i < destination_size; i++)
3182 {
3183 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(i)))
)->d;
3184 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3185 if (d == symbolic_graph->breakpoints[j].d)
3186 {
3187 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)))
;
3188 ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3189 // Found, made, exit.
3190 break;
3191 }
3192 }
3193 return dup_breakpoints;
3194}
3195
3196// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3197static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3198{
3199 assert(source_size > 0)((void) sizeof ((source_size > 0) ? 1 : 0), __extension__ (
{ if (source_size > 0) ; else __assert_fail ("source_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3199, __extension__ __PRETTY_FUNCTION__
); }))
;
3200 assert(destination_size > 0)((void) sizeof ((destination_size > 0) ? 1 : 0), __extension__
({ if (destination_size > 0) ; else __assert_fail ("destination_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3200, __extension__ __PRETTY_FUNCTION__
); }))
;
3201 // First, fill all the "auto" holes.
3202 // This is the symbol table that with "auto" info filled up.
3203 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3204 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3205 ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3206 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3206, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3206, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3206, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3206, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
3207 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3208 int i, j, k, p, q;
3209 const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3210 ccv_sparse_matrix_t* exec_dep;
3211 ccv_nnc_tensor_block_t* tensor_blocks;
3212 _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3213 int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3214 // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3215 // are automatically filled in, and all the sub-graphs are processed.
3216 // There is a last step though, for a while loop, it is parameterized:
3217 // while (x > 5) {
3218 // y = x + 1;
3219 // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3220 // we will do our best to avoid to do the actual data copy, what we do here is to check whether y can be x's alias.
3221 // If y can be x's alias, this is good, no other changes required. In above case, y can be x's alias because
3222 // it is an inplace operation.
3223 // But if y cannot be x's alias, for example, this while loop looks like this:
3224 // while (x > 5) {
3225 // y = x + a
3226 // b = x + y
3227 // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3228 // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3229 // has dependency on y as well).
3230 // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3231 // y = x + a -> b = x + y
3232 // This graph will be extended to look like this:
3233 // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3234 // while (x0 > 5) {
3235 // y0 = x0 + a0
3236 // b0 = x0 + y0
3237 // if (y0 > 5) break
3238 // y1 = y0 + b0
3239 // b1 = y0 + y1
3240 // } (y1 => x0, b1 => a0)
3241 // After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3242 // with each other now).
3243 // With this algorithm, we don't need to insert any data copy logic, the only thing needed is to switch pointers
3244 // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
3245 ccv_nnc_symbolic_graph_t* dup_graph = 0;
3246 int* dup_exec_ref = 0;
3247 int* dup_tensor_block_ref = 0;
3248 int unroll_count = 0;
3249 // In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3250 ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3251 prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3252 prep->flags = 0;
3253 // Cannot handle dup a node that is a graph as well.
3254 if (p_exec_symbol_info)
3255 {
3256 prep->flags = p_node_info->flags;
3257 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3258 {
3259 _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3260 _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
, symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3261 } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3262 // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3263 }
3264 }
3265 ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3266 ccv_array_t* anonymous_block_free_list = 0;
3267 const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3268 // Record whether this tensor is folded in this round.
3269 uint32_t* const tensor_fold = (uint32_t*)ccmallocmalloc(sizeof(uint32_t) * tensor_fold_size);
3270 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
3271 for (p = 0; p < node->graph_ref_size; p++)
3272 {
3273 assert(symbolic_graph->sub_graphs)((void) sizeof ((symbolic_graph->sub_graphs) ? 1 : 0), __extension__
({ if (symbolic_graph->sub_graphs) ; else __assert_fail (
"symbolic_graph->sub_graphs", "ccv_nnc_symbolic_graph_compile.c"
, 3273, __extension__ __PRETTY_FUNCTION__); }))
;
3274 ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (
node)->_inline_graph_ref)[p] - 1)))
;
3275 ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3276 ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0)((void*)(((char*)((sub_graph->sources)->data)) + (size_t
)(sub_graph->sources)->rsize * (size_t)(0)))
, sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0)((void*)(((char*)((sub_graph->destinations)->data)) + (
size_t)(sub_graph->destinations)->rsize * (size_t)(0)))
, sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3277 sub_prep->dup_breakpoints = dup_breakpoints;
3278 sub_prep->p = prep;
3279 sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[p] - 1] = sub_prep;
3280 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3281 const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3282 for (i = 0; i < s_alloc_prep->block_size; i++)
3283 {
3284 const int block_ref = s_alloc_prep->blocks[i].block_ref;
3285 const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3286 if (block_ref < sub_prep->tensor_symbol_info_size)
3287 {
3288 // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3289 // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3290 if (s_tensor_blocks[block_ref].bypass_ref)
3291 {
3292 int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3293 while (s_tensor_blocks[bypass_ref].ref)
3294 bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3295 if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3296 s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3297 continue;
3298 }
3299 if (s_tensor_blocks[block_ref].p_refs[0])
3300 {
3301 /* If it is already properly assigned, next. */
3302 if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3303 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3304 {
3305 if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3306 s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3307 else {
3308 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3308, __extension__ __PRETTY_FUNCTION__
); }))
;
3309 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3310 }
3311 }
3312 /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3313 if (s_tensor_blocks[block_ref].p_refs[1] &&
3314 s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3315 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3316 {
3317 assert(s_alloc_prep->buffers[buffer_ref].p_refs[0])((void) sizeof ((s_alloc_prep->buffers[buffer_ref].p_refs[
0]) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[buffer_ref
].p_refs[0]) ; else __assert_fail ("s_alloc_prep->buffers[buffer_ref].p_refs[0]"
, "ccv_nnc_symbolic_graph_compile.c", 3317, __extension__ __PRETTY_FUNCTION__
); }))
;
3318 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3318, __extension__ __PRETTY_FUNCTION__
); }))
;
3319 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3320 }
3321 }
3322 } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3323 /* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3324 * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3325 * these anonymous blocks that have dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3326 * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3327 * its life-time to the end of the output tensor. */
3328 if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3329 s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3330 for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3331 ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j)((void*)(((char*)((s_tensor_blocks[block_ref].dup_p_refs)->
data)) + (size_t)(s_tensor_blocks[block_ref].dup_p_refs)->
rsize * (size_t)(j)))
);
3332 }
3333 }
3334 }
3335 const int init_tensor_block_size = tensor_block_size;
3336 int rw_anonymous_buffer_size_cap = 0;
3337 int ro_anonymous_buffer_size_cap = 0;
3338 if (anonymous_block_free_list)
3339 ccv_array_clear(anonymous_block_free_list);
3340 memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3341 for (p = 0; p < node->graph_ref_size; p++)
3342 {
3343 ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[p] - 1];
3344 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3345 int rw_anonymous_buffer_size = 0;
3346 int ro_anonymous_buffer_size = 0;
3347 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3348 if (s_alloc_prep->buffers[i].p_refs[0])
3349 {
3350 /* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3351 int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3352 /* Need to go through refs. Since we reuse the tensor block for this input, it now has to allocate at least this much space. */
3353 int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3354 assert(p_ref_0_is_in_or_out != 0)((void) sizeof ((p_ref_0_is_in_or_out != 0) ? 1 : 0), __extension__
({ if (p_ref_0_is_in_or_out != 0) ; else __assert_fail ("p_ref_0_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3354, __extension__ __PRETTY_FUNCTION__
); }))
;
3355 int unref_p_ref_0 = p_ref_0;
3356 while (tensor_blocks[unref_p_ref_0].ref)
3357 unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3358 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3359 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3359, __extension__ __PRETTY_FUNCTION__); }))
;
3360 if (s_alloc_prep->buffers[i].p_refs[1])
3361 {
3362 int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3363 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3364 assert(p_ref_1_is_in_or_out != 0)((void) sizeof ((p_ref_1_is_in_or_out != 0) ? 1 : 0), __extension__
({ if (p_ref_1_is_in_or_out != 0) ; else __assert_fail ("p_ref_1_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3364, __extension__ __PRETTY_FUNCTION__
); }))
;
3365 int unref_p_ref_1 = p_ref_1;
3366 while (tensor_blocks[unref_p_ref_1].ref)
3367 unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3368 /* See above comment for the similar p_ref_0 check. */
3369 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]))((void) sizeof ((!((tensor_blocks[unref_p_ref_1].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_1].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 3369, __extension__ __PRETTY_FUNCTION__); }))
;
3370 assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out)((void) sizeof ((p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ; else __assert_fail ("p_ref_0_is_in_or_out != p_ref_1_is_in_or_out"
, "ccv_nnc_symbolic_graph_compile.c", 3370, __extension__ __PRETTY_FUNCTION__
); }))
;
3371 int p_ref_t;
3372 if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3373 {
3374 CCV_SWAP(p_ref_0, p_ref_1, p_ref_t)((p_ref_t) = (p_ref_0), (p_ref_0) = (p_ref_1), (p_ref_1) = (p_ref_t
))
;
3375 CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t)((p_ref_t) = (unref_p_ref_0), (unref_p_ref_0) = (unref_p_ref_1
), (unref_p_ref_1) = (p_ref_t))
;
3376 }
3377 p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3378 /* If the dimensions match, can fold. TODO: should the dimensions match perfectly here? */
3379 if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
3380 {
3381 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3382 if (folded)
3383 {
3384 p_ref_0 = p_ref_1;
3385 unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3386 tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3387 for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3388 {
3389 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3390 assert(folded && "the subsequent duplicates can be folded too.")((void) sizeof ((folded && "the subsequent duplicates can be folded too."
) ? 1 : 0), __extension__ ({ if (folded && "the subsequent duplicates can be folded too."
) ; else __assert_fail ("folded && \"the subsequent duplicates can be folded too.\""
, "ccv_nnc_symbolic_graph_compile.c", 3390, __extension__ __PRETTY_FUNCTION__
); }))
;
3391 }
3392 }
3393 }
3394 }
3395 /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem).
3396 * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
3397 * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3398 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3399 * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3400 * associated with it, then we are good. */
3401 if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3402 (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3403 (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3404 TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3405 {
3406 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3407 { assert(s_alloc_prep->buffers[i].p_refs[1] == 0)((void) sizeof ((s_alloc_prep->buffers[i].p_refs[1] == 0) ?
1 : 0), __extension__ ({ if (s_alloc_prep->buffers[i].p_refs
[1] == 0) ; else __assert_fail ("s_alloc_prep->buffers[i].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3407, __extension__ __PRETTY_FUNCTION__
); }))
; }
3408 /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3409 * is a long argument why that is the case, the digest is, it is much easier to control your output
3410 * than your input). */
3411 s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3412 s_alloc_prep->buffers[i].p_refs[1] = 0;
3413 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3414 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3414, __extension__ __PRETTY_FUNCTION__); }))
;
3415 tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size)({ typeof (s_alloc_prep->buffers[i].size) _a = (s_alloc_prep
->buffers[i].size); typeof (tensor_blocks[unref_p_ref_0].size
) _b = (tensor_blocks[unref_p_ref_0].size); (_a > _b) ? _a
: _b; })
;
3416 for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3417 tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3418 tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3419 tensor_blocks[unref_p_ref_0].size;
3420 } else {
3421 s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3422 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3423 ++ro_anonymous_buffer_size;
3424 else
3425 rw_anonymous_buffer_size += unroll_count + 1;
3426 }
3427 } else {
3428 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3429 ++ro_anonymous_buffer_size;
3430 else
3431 rw_anonymous_buffer_size += unroll_count + 1;
3432 }
3433 if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3434 {
3435 const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3436 // All read-write buffer (potentially) can be reused between each case..of branch.
3437 rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3438 // Read-only buffer cannot be reused between each case..of branch.
3439 ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3440 /* Anonymous block, allocate additional tensor blocks for this. */
3441 /* This is either because this is an internal tensor (don't have p_ref) */
3442 /* or it is an anonymous block itself within the sub graphs of this while graph. */
3443 tensor_blocks = (ccv_nnc_tensor_block_t*)ccreallocrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3444 memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3445 if (dup_tensor_block_ref)
3446 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3447 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3448 if (!s_alloc_prep->buffers[i].p_refs[0])
3449 {
3450 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3451 {
3452 assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)((void) sizeof ((tensor_block_size < init_tensor_block_size
+ rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ? 1 : 0), __extension__ ({ if (tensor_block_size < init_tensor_block_size
+ rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ; else __assert_fail ("tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap"
, "ccv_nnc_symbolic_graph_compile.c", 3452, __extension__ __PRETTY_FUNCTION__
); }))
;
3453 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size])(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0x10) | ANONYMOUS))
;
3454 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3455 tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3456 tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3457 tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3458 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3459 tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3460 ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3461 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3462 if (dup_p_refs && dup_p_refs->rnum > 0)
3463 {
3464 for (j = 0; j < dup_p_refs->rnum; j++)
3465 {
3466 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3467 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3467, __extension__ __PRETTY_FUNCTION__
); }))
;
3468 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3468, __extension__ __PRETTY_FUNCTION__
); }))
;
3469 assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3469, __extension__ __PRETTY_FUNCTION__); }))
;
3470 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3471 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3472 if (tensor_symbol_info[dup_p_ref].p_ref)
3473 {
3474 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3475 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3475, __extension__ __PRETTY_FUNCTION__); }))
;
3476 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3477 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3478 {
3479 if (!tensor_blocks[tensor_block_size].dup_p_refs)
3480 tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3481 ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3482 }
3483 }
3484 if (!tensor_blocks[tensor_block_size].tail)
3485 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3486 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3487 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
(size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k)))
, tensor_blocks[tensor_block_size]);
3488 }
3489 } else {
3490 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3491 ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3492 }
3493 for (j = 0; j < source_size; j++)
3494 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3495 /* If this is a read-only (based on SSA, if first encountered as read), and this is
3496 * sub-graph. Mark it to the end of the graph. */
3497 if (p_exec_symbol_info)
3498 for (j = 0; j < destination_size; j++)
3499 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3500 /* If it is read-only, it is self-reflecting. */
3501 for (k = 0; k < unroll_count; k++)
3502 {
3503 for (j = 0; j < destination_size; j++)
3504 if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3505 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3506 /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3507 assert(symbolic_graph->p)((void) sizeof ((symbolic_graph->p) ? 1 : 0), __extension__
({ if (symbolic_graph->p) ; else __assert_fail ("symbolic_graph->p"
, "ccv_nnc_symbolic_graph_compile.c", 3507, __extension__ __PRETTY_FUNCTION__
); }))
;
3508 dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3509 }
3510 ++tensor_block_size;
3511 } else {
3512 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3513 const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3514 const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3515 // Find suitable tensor block from the free list.
3516 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS))
;
3517 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3518 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3519 if (new_anonymous_tensor_block)
3520 {
3521 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3522 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3523 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3524 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3525 ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3526 } else {
3527 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3528 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
_a : _b; })
;
3529 }
3530 if (dup_p_refs && dup_p_refs->rnum > 0)
3531 {
3532 for (j = 0; j < dup_p_refs->rnum; j++)
3533 {
3534 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3535 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3535, __extension__ __PRETTY_FUNCTION__
); }))
;
3536 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3536, __extension__ __PRETTY_FUNCTION__
); }))
;
3537 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3538 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3539 if (tensor_symbol_info[dup_p_ref].p_ref)
3540 {
3541 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3542 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3542, __extension__ __PRETTY_FUNCTION__); }))
;
3543 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3544 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3545 {
3546 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3547 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3548 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3549 }
3550 }
3551 assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3551, __extension__ __PRETTY_FUNCTION__); }))
;
3552 if (!tensor_blocks[tensor_block_idx].tail)
3553 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3554 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3555 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
(size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k)))
, tensor_blocks[tensor_block_idx]);
3556 // We have to add it to the wrap-around companion_ref as well.
3557 // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3558 // be occupied and unlikely to be reused), we cannot really do much about it because the companion_ref's
3559 // definition is too free-form, and if we enforce a stronger guarantee on this (such as it must wrap around), this
3560 // guarantee may be broken down the line.
3561 if (tensor_blocks[dup_p_ref].companion_ref)
3562 {
3563 const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3564 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3565 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3566 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3567 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3568 }
3569 }
3570 } else if (new_anonymous_tensor_block) {
3571 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3572 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3573 }
3574 const int prev_tensor_block_idx = tensor_block_idx;
3575 if (new_anonymous_tensor_block)
3576 {
3577 if (!anonymous_block_free_list)
3578 anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3579 ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3580 ++tensor_block_size;
3581 }
3582 for (k = 0; k < unroll_count; k++)
3583 {
3584 const int tensor_block_idx = new_anonymous_tensor_block ?
3585 (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3586 dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3587 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS))
;
3588 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3589 if (new_anonymous_tensor_block)
3590 {
3591 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3592 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3593 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3594 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3595 /* Attach to duplicated exec for this tensor block. */
3596 ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3597 } else {
3598 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3599 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
_a : _b; })
;
3600 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3601
3602 }
3603 if (dup_p_refs && dup_p_refs->rnum > 0)
3604 {
3605 /* Not nil, not self-reflecting. */
3606 for (j = 0; j < dup_p_refs->rnum; j++)
3607 {
3608 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3609 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3609, __extension__ __PRETTY_FUNCTION__
); }))
;
3610 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3610, __extension__ __PRETTY_FUNCTION__
); }))
;
3611 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3612 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3613 if (tensor_symbol_info[dup_p_ref].p_ref)
3614 {
3615 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3616 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3616, __extension__ __PRETTY_FUNCTION__); }))
;
3617 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3618 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3619 {
3620 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3621 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3622 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3623 }
3624 }
3625 assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref)((void) sizeof ((dup_tensor_block_ref[dup_p_ref * unroll_count
+ k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count
+ k] != dup_p_ref) ? 1 : 0), __extension__ ({ if (dup_tensor_block_ref
[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref
[dup_p_ref * unroll_count + k] != dup_p_ref) ; else __assert_fail
("dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref"
, "ccv_nnc_symbolic_graph_compile.c", 3625, __extension__ __PRETTY_FUNCTION__
); }))
;
3626 const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3627 assert(tensor_blocks[dup_dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_dup_p_ref].tail) ? 1 : 0),
__extension__ ({ if (tensor_blocks[dup_dup_p_ref].tail) ; else
__assert_fail ("tensor_blocks[dup_dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3627, __extension__ __PRETTY_FUNCTION__); }))
;
3628 if (!tensor_blocks[tensor_block_idx].tail)
3629 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3630 for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3631 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q)((void*)(((char*)((tensor_blocks[dup_dup_p_ref].tail)->data
)) + (size_t)(tensor_blocks[dup_dup_p_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3632 // We have to add it to the wrap-around companion_ref as well.
3633 if (tensor_blocks[dup_dup_p_ref].companion_ref)
3634 {
3635 const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3636 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3637 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3638 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3639 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3640 }
3641 }
3642 } else if (new_anonymous_tensor_block) {
3643 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3644 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3645 }
3646 if (new_anonymous_tensor_block)
3647 ++tensor_block_size;
3648 }
3649 }
3650 }
3651 }
3652 }
3653 } ccv_nnc_graph_visit_endfor} }
3654 if (anonymous_block_free_list)
3655 ccv_array_free(anonymous_block_free_list);
3656 ccfreefree(tensor_fold);
3657 // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3658 // the allocation dependencies, thus, which tensor is reused to the existing tensor.
3659 ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3660 prep->while_count_tensor = 0;
3661 prep->dup_breakpoints = 0;
3662 prep->p = 0;
3663 prep->symbolic_graph = symbolic_graph;
3664 prep->p_idx = symbolic_graph->p_idx;
3665 prep->exec_idx = symbolic_graph->exec_idx;
3666 prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3667 prep->sub_preps = sub_preps;
3668 prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3669 prep->exec_symbol_info = exec_symbol_info;
3670 prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3671 prep->tensor_symbol_info = tensor_symbol_info;
3672 prep->unroll_count = unroll_count;
3673 prep->dup_tensor_block_ref = dup_tensor_block_ref;
3674 prep->tensor_block_size = tensor_block_size;
3675 prep->tensor_blocks = tensor_blocks;
3676 prep->exec_flags = exec_flags;
3677 prep->visit = visit;
3678 prep->alloc_prep = alloc_prep;
3679 if (dup_graph)
3680 ccv_nnc_symbolic_graph_free(dup_graph);
3681 if (dup_exec_ref)
3682 ccfreefree(dup_exec_ref);
3683 return prep;
3684}
3685
/* Releases a graph prep object together with everything it holds:
 * the tensor blocks, the exec flags, every non-null sub-prep (recursively),
 * the sub_preps array, the tensor and exec symbol info arrays, the optional
 * dup_tensor_block_ref table, the alloc_prep, the visit object, and finally
 * the prep structure itself.
 * NOTE(review): in this listing each macro name is immediately followed by its
 * expansion, so ccfreefree presumably reads as ccfree expanding to free. */
3686static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3687{
3688 int i;
3689 _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3690 ccfreefree(prep->exec_flags);
/* Free each sub-prep that was actually created; null slots are skipped. */
3691 for (i = 0; i < prep->sub_prep_size; i++)
3692 if (prep->sub_preps[i])
3693 _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
/* The sub_preps array itself may be null, so it is only freed when present. */
3694 if (prep->sub_preps)
3695 ccfreefree(prep->sub_preps);
3696 ccfreefree(prep->tensor_symbol_info);
3697 ccfreefree(prep->exec_symbol_info);
/* dup_tensor_block_ref is optional and only freed when it was set. */
3698 if (prep->dup_tensor_block_ref)
3699 ccfreefree(prep->dup_tensor_block_ref);
3700 _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3701 ccv_nnc_graph_visit_free(prep->visit);
3702 ccfreefree(prep);
3703}
3704
/* Walks every exec node in graph_prep->visit and sets the while_count_tensor
 * flag on each prep whose while-count tensor is referenced by a while node
 * input, then recurses into every sub-graph prep referenced by the node. */
3705static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3706{
3707 int i, j;
3708 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
/* Only nodes flagged as while nodes carry p_while inputs to inspect. */
3709 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3710 {
/* Graph refs are stored 1-based; the first one indexes the sub-prep of this while node. */
3711 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3712 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3712, __extension__ __PRETTY_FUNCTION__
); }))
;
3713 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3714 for (i = 0; i < node->p_while.input_size; i++)
/* A while-count symbol is recognized by its low 4 bits being 0xe. */
3715 if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3716 {
3717 ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
/* The decoded value d is the number of parent links to follow, starting from the sub-prep. */
3718 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3719 for (j = 0; j < d; j++)
3720 prep = prep->p;
/* Mark the prep reached after d steps as having its while-count tensor referenced. */
3721 prep->while_count_tensor = 1;
3722 }
3723 }
/* Recurse into every sub-graph this node references; entries that decode to a negative index are skipped. */
3724 for (i = 0; i < node->graph_ref_size; i++)
3725 {
3726 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3727 if (graph_ref >= 0)
3728 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3729 }
3730 } ccv_nnc_graph_visit_endfor} }
3731}
3732
/* Resolves a tensor symbol index to a tensor for the given graph prep.
 * - symbol >= 0: returns the entry of graph_prep->tensor_arena->vt_tensors at that index.
 * - symbol == CCV_NNC_NO_TENSOR_SYMBOL: returns 0.
 * - otherwise the symbol must be a while-count symbol (asserted): the encoded
 *   depth d is decoded, d parent links are followed starting from graph_prep,
 *   and a tensor is fetched from the tensor_metadata of the arena of that prep. */
3733static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3734{
3735 if (symbol >= 0)
3736 return graph_prep->tensor_arena->vt_tensors[symbol];
3737 if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3738 return 0;
3739 assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
: 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3739, __extension__ __PRETTY_FUNCTION__
); }))
;
3740 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3741 int i;
/* d is the number of parent links to follow, counted from graph_prep itself. */
3742 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3743 for (i = 0; i < d; i++)
3744 prep = prep->p;
/* The target prep must already have been flagged as owning a while-count tensor. */
3745 assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3745, __extension__ __PRETTY_FUNCTION__
); }))
;
/* NOTE(review): position (0 << 1) + 1 is presumably the metadata slot reserved for the while-count tensor; confirm against the tensor arena construction. */
3746 return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3747}
3748
/* Topologically sorts the concrete graph and rewrites the exec indices stored
 * in graph_exec_arena (source, destination, and every graph_execs entry that
 * belongs to this graph) through the conversion table filled in by the sort. */
3749static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3750{
3751 int i;
/* One int per exec in the graph. NOTE(review): presumably exec_cvt maps an old exec index to its index after sorting, which matches how it is used below; confirm against ccv_nnc_graph_topsort. */
3752 int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
3753 ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3754 graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3755 graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3756 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
/* Only entries whose graph field is this graph are remapped; all other entries are left untouched. */
3757 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3758 if (graph_execs[i].graph == graph)
3759 graph_execs[i].d = exec_cvt[graph_execs[i].d];
3760 ccfreefree(exec_cvt);
3761}
3762
3763static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3764{
3765 int i, j, k;
3766 ccv_nnc_graph_t* const graph = graph_prep->graph;
3767 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3768 ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
1
Uninitialized value stored to field 'graph'
3769 graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3770 graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3771 graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3772 graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3773 memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3774 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3775 int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3776 for (i = 0; i < exec_symbol_info_size; i++)
2
Assuming 'i' is >= 'exec_symbol_info_size'
3
Loop condition is false. Execution continues on line 3785
3777 {
3778 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; })
;
3779 max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; })
;
3780 if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3781 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
: _b; })
;
3782 graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3783 graph_execs[i].graph = 0;
3784 }
3785 for (i = 0; i < graph_prep->sub_prep_size; i++)
4
Assuming 'i' is >= field 'sub_prep_size'
5
Loop condition is false. Execution continues on line 3787
3786 max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; })
;
3787 ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
6
'?' condition is true
3788 ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
7
'?' condition is true
3789 ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })
];
8
'?' condition is true
3790 const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3791 const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3792 // Create node, this is in topological order.
3793 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
9
Assuming '_i_' is < field 'size'
10
Loop condition is true. Entering loop body
3794 if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
11
The left operand of '==' is a garbage value
3795 {
3796 for (i = 0; i < node->input_size; i++)
3797 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3798 for (i = 0; i < node->output_size; i++)
3799 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3800 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3801 {
3802 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3803 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3803, __extension__ __PRETTY_FUNCTION__
); }))
;
3804 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3805 ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3806 graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3807 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3808 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3809 ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3810 for (i = 0; i < node->p_while.input_size; i++)
3811 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3812 for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3813 max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3814 ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3815 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3816 } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3817 for (i = 0; i < node->output_size; i++)
3818 if (max_outputs[i] && max_outputs[i]->alias_ref)
3819 max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3820 graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3821 // Check whether this is already covered in the inputs, if not, need to be covered in the update.
3822 for (i = 0; i < node->case_of.argument.offset; i++)
3823 {
3824 ccv_nnc_tensor_t* const update = max_inputs[i];
3825 if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3826 continue;
3827 int flag = 0;
3828 for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3829 flag = (update == max_inputs[j]);
3830 if (!flag)
3831 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3832 }
3833 const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3834 ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3835 if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3836 {
3837 // Add another graph for data transfer.
3838 ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3839 for (i = 0; i < node->output_size; i++)
3840 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3841 ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
, max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
);
3842 ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3843 ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3844 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3845 int exec_cvt;
3846 ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3847 }
3848 for (i = 0; i < node->graph_ref_size; i++)
3849 {
3850 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3851 if (graph_ref < 0)
3852 continue;
3853 ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3854 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3855 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3856 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3857 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3858 }
3859 } else {
3860 graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3861 }
3862 ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3863 }
3864 } ccv_nnc_graph_visit_endfor} }
3865 // Then connect them.
3866 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
3867 if (node->outgoings)
3868 for (i = 0; i < node->outgoings->rnum; i++)
3869 {
3870 const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
3871 if (graph_execs[outgoing].graph)
3872 ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3873 }
3874 } ccv_nnc_graph_visit_endfor} }
3875 int source_exec_created = 0;
3876 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3877 const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3878 ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3879 // After the graph is materialized, we need to handle the case that some of these tensors require to be initialized to zero before use.
3880 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3881 {
3882 if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
3883 {
3884 int ref = i;
3885 while (tensor_symbol_info[ref].alias_ref)
3886 ref = tensor_symbol_info[ref].alias_ref - 1;
3887 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
3888 ref = tensor_blocks[ref].ref - 1;
3889 // This is not computable. It could be that we marked a const tensor as init zero.
3890 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
)
3891 continue;
3892 // If this tensor is not used by any exec, we don't need to init at all. Skip.
3893 if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3894 continue;
3895 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3896 // Now, we have the original tensor, we can get the actual tensor, and construct the set command.
3897 ccv_nnc_graph_exec_t set_exec;
3898 if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3899 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3900 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3901 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3902 for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3903 {
3904 const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)))
;
3905 if (outgoing >= exec_symbol_info_size)
3906 continue;
3907 assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3907, __extension__ __PRETTY_FUNCTION__
); }))
;
3908 assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3908, __extension__ __PRETTY_FUNCTION__
); }))
;
3909 ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3910 }
3911 int flags = 0;
3912 if (alloc_dep[ref])
3913 for (j = 0; j < alloc_dep[ref]->rnum; j++)
3914 {
3915 const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)))
;
3916 // This is from alloc_dep, it should be computable.
3917 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3917, __extension__ __PRETTY_FUNCTION__
); }))
;
3918 if (tensor_blocks[d].tail)
3919 for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3920 {
3921 const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
3922 if (incoming >= exec_symbol_info_size)
3923 continue;
3924 assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3924, __extension__ __PRETTY_FUNCTION__
); }))
;
3925 assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3925, __extension__ __PRETTY_FUNCTION__
); }))
;
3926 ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3927 flags = 1;
3928 }
3929 }
3930 // If cannot find a start node for this exec, we need to append it to the no-op of the start.
3931 if (!flags)
3932 {
3933 if (!source_exec_created)
3934 {
3935 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3936 source_exec_created = 1;
3937 }
3938 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3939 }
3940 }
3941 }
3942 // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3943 // (we need that if it is not associated as inputs / outputs of any execs, this is possible if all execs associate
3944 // with its alias).
3945 assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3945, __extension__ __PRETTY_FUNCTION__
); }))
;
3946 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3947 {
3948 ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3949 // If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3950 if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3951 {
3952 const ccv_array_t* const head = tensor_blocks[i].head;
3953 if (head && head->rnum > 0)
3954 for (j = 0; j < head->rnum; j++)
3955 {
3956 const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(j)))
;
3957 if (idx >= exec_symbol_info_size)
3958 continue;
3959 assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3959, __extension__ __PRETTY_FUNCTION__); }))
;
3960 const int d = graph_execs[idx].d;
3961 ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)))
;
3962 int flag = 0;
3963 if (exec_info->tensor_wraps_ref)
3964 {
3965 ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)))
;
3966 for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3967 flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3968 }
3969 // If none is in the flag, it need to be included in the cast.
3970 if (!flag)
3971 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3972 }
3973 }
3974 }
3975 // Create source / destination phony node. This is to facilitate use of compiled graph.
3976 // Also, this is needed if you have init zero execs.
3977 if (source_exec_created || source_size > 1)
3978 {
3979 if (!source_exec_created)
3980 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3981 for (i = 0; i < source_size; i++)
3982 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3983 } else {
3984 assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3984, __extension__ __PRETTY_FUNCTION__
); }))
;
3985 assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3985, __extension__ __PRETTY_FUNCTION__
); }))
;
3986 graph_exec_arena->source = graph_execs[sources[0].d];
3987 }
3988 if (destination_size == 1)
3989 graph_exec_arena->destination = graph_execs[destinations[0].d];
3990 else {
3991 graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3992 for (i = 0; i < destination_size; i++)
3993 ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3994 }
3995 ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3996 ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3997 return graph_exec_arena;
3998}
3999
4000static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
4001{
4002 if (graph_prep->symbolic_graph == pair)
4003 return graph_prep->graph;
4004 int i;
4005 for (i = 0; i < graph_prep->sub_prep_size; i++)
4006 if (graph_prep->sub_preps[i])
4007 {
4008 ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
4009 if (graph)
4010 return graph;
4011 }
4012 return 0;
4013}
4014
4015static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4016{
4017 int i;
4018 for (i = 0; i < graph_prep->sub_prep_size; i++)
4019 if (graph_prep->sub_preps[i])
4020 {
4021 if (graph_prep->sub_preps[i]->symbolic_graph->pair)
4022 graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
4023 }
4024}
4025
4026static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4027{
4028 assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4028, __extension__ __PRETTY_FUNCTION__
); }))
;
4029 int i;
4030 for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
4031 {
4032 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
)
)
4033 continue;
4034 if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4035 {
4036 ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4037 .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4038 .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4039 });
4040 if (pair_exec.d >= 0)
4041 ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4042 }
4043 }
4044 for (i = 0; i < graph_prep->sub_prep_size; i++)
4045 if (graph_prep->sub_preps[i])
4046 _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4047}
4048
4049static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4050{
4051 int i;
4052 if (graph_prep->dup_breakpoints)
4053 {
4054 // Strip the const modifier only possible because it is a sub-graph.
4055 ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4056 for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4057 ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
+ (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i)))
);
4058 ccv_array_free(graph_prep->dup_breakpoints);
4059 graph_prep->dup_breakpoints = 0;
4060 graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4061 // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
4062 memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0)))
, sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4063 // Since exec_symbol_info changed, create a new visit object.
4064 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4064, __extension__ __PRETTY_FUNCTION__
); }))
;
4065 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4065, __extension__ __PRETTY_FUNCTION__); }))
;
4066 ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)))
;
4067 const int source_size = symbolic_graph->sources->rnum;
4068 ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
;
4069 const int destination_size = symbolic_graph->destinations->rnum;
4070 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4070, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4070, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4070, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4070, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
4071 ccv_nnc_graph_visit_free(graph_prep->visit);
4072 graph_prep->visit = visit;
4073 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4073, __extension__ __PRETTY_FUNCTION__
); }))
;
4074 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4075 }
4076 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
4077 for (i = 0; i < node->graph_ref_size; i++)
4078 {
4079 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
4080 if (graph_ref >= 0)
4081 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4082 }
4083 } ccv_nnc_graph_visit_endfor} }
4084}
4085
4086const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4087
4088void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4089{
4090 assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4090, __extension__ __PRETTY_FUNCTION__); }))
;
4091 assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4091, __extension__ __PRETTY_FUNCTION__
); }))
;
4092 assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4092, __extension__ __PRETTY_FUNCTION__
); }))
;
4093 int i;
4094 // Cannot bind the multi-view.
4095 for (i = 0; i < tensor_bind_size; i++)
4096 {
4097 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4097, __extension__ __PRETTY_FUNCTION__
); }))
;
4098 assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4098, __extension__ __PRETTY_FUNCTION__
); }))
;
4099 }
4100 ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4101 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4102 ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4103 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4104 *tensor_arena_ref = tensor_arena;
4105 // The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
4106 _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4107 // Now tensor allocation is done, if there are any dup_breakpoints, I need to clean it up.
4108 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4109 *graph_ref = graph_prep->graph;
4110 ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4111 _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4112 _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4113 *graph_exec_arena_ref = graph_exec_arena;
4114 _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4115}
4116
4117static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4118{
4119 // Buffers are inherited from above, no need to dealloc.
4120 int i;
4121 for (i = 0; i < tensor_arena->sub_arena_size; i++)
4122 if (tensor_arena->sub_arenas[i])
4123 _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4124 for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4125 {
4126 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
(size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i)))
);
4127 assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4127, __extension__ __PRETTY_FUNCTION__
); }))
;
4128 ccv_nnc_tensor_multiview_free(*mv);
4129 }
4130 ccv_array_free(tensor_arena->tensor_metadata);
4131 ccv_array_free(tensor_arena->m_tensor_idx);
4132 if (tensor_arena->pb_vt_tensors)
4133 ccfreefree(tensor_arena->pb_vt_tensors);
4134 if (tensor_arena->vt_alias_r_refs_p)
4135 ccfreefree(tensor_arena->vt_alias_r_refs_p);
4136 if (tensor_arena->vt_sizes)
4137 ccfreefree(tensor_arena->vt_sizes);
4138 ccfreefree(tensor_arena);
4139}
4140
4141void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4142{
4143 assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
== (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4143, __extension__ __PRETTY_FUNCTION__
); }))
;
4144 assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4144, __extension__ __PRETTY_FUNCTION__
); }))
;
4145 assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4145, __extension__ __PRETTY_FUNCTION__
); }))
;
4146 // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4147 int i;
4148 if (!tensor_arena->pb_vt_tensors)
4149 {
4150 tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4151 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4152 if (tensor_arena->vt_tensors[i])
4153 tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4154 }
4155 if (!tensor_arena->vt_alias_r_refs_p)
4156 {
4157 tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4158 tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4159 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4160 if (tensor_arena->vt_alias_refs[i])
4161 {
4162 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4163 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4163, __extension__ __PRETTY_FUNCTION__
); }))
;
4164 ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4165 }
4166 int refp = 0;
4167 for (i = 0; i < tensor_arena->vt_tensor_size; i++) // Allocate each with aliases position on vt_alias_r_refs. It points to the end.
4168 if (tensor_arena->vt_alias_r_refs_p[i])
4169 refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4170 else
4171 tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4172 for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4173 tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4174 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4175 if (tensor_arena->vt_alias_refs[i])
4176 {
4177 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4178 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4178, __extension__ __PRETTY_FUNCTION__
); }))
;
4179 const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4180 assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4180, __extension__ __PRETTY_FUNCTION__); }))
;
4181 tensor_arena->vt_alias_r_refs[pos] = i;
4182 }
4183 }
4184 const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
4185 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4186 {
4187 assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4187, __extension__ __PRETTY_FUNCTION__
); }))
; // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4188 assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4190, __extension__ __PRETTY_FUNCTION__
); }))
4189 ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4190, __extension__ __PRETTY_FUNCTION__
); }))
4190 (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4190, __extension__ __PRETTY_FUNCTION__
); }))
;
4191 } else
4192 { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof ((ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if (ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4192, __extension__ __PRETTY_FUNCTION__
); }))
; }
4193 if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d])((*(int*)(tensor_arena->vt_tensors[symbol.d])) & CCV_TENSOR_VIEW
)
)
4194 { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors
[symbol.d])->off == 0) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t
*)tensor_arena->vt_tensors[symbol.d])->off == 0) ; else
__assert_fail ("((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4194, __extension__ __PRETTY_FUNCTION__
); }))
; }
4195 tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4196 if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4197 for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4198 {
4199 const int d = tensor_arena->vt_alias_r_refs[i];
4200 if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4201 break;
4202 ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4203 d_tensor->info.datatype = tensor->info.datatype;
4204 d_tensor->info.reserved = tensor->info.reserved;
4205 if (CCV_IS_TENSOR_VIEW(d_tensor)((*(int*)(d_tensor)) & CCV_TENSOR_VIEW))
4206 ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4207 else {
4208 d_tensor->data.u8 = tensor->data.u8;
4209 d_tensor->dataof = tensor->dataof;
4210 }
4211 }
4212}
4213
4214void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4215{
4216 if (!tensor_arena->pb_vt_tensors)
4217 return;
4218 int i;
4219 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4220 if (tensor_arena->vt_tensors[i])
4221 tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4222}
4223
4224uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4225{
4226 uint64_t total_size = 0;
4227 int i;
4228 for (i = 0; i < tensor_arena->buffer_size; i++)
4229 total_size += tensor_arena->buffers[i].size;
4230 return total_size;
4231}
4232
4233static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4234{
4235 int i;
4236 if (mv->it)
4237 mv->it->info = params;
4238 for (i = 0; i < mv->repeat + mv->kind; i++)
4239 {
4240 ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i];
4241 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4242 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4243 else
4244 tensor->info = params;
4245 }
4246}
4247
4248int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4249{
4250 int i;
4251 assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size)((void) sizeof ((graph->tensor_symbol_info->rnum >= tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (graph->
tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size
) ; else __assert_fail ("graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4251, __extension__ __PRETTY_FUNCTION__
); }))
;
4252 if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
4253 {
4254 tensor_arena->vt_sizes = (size_t*)ccmallocmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4255 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4256 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4257 {
4258 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4259 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4260 {
4261 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4262 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
4263 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
4264 tensor = (ccv_nnc_tensor_t*)mv;
4265 }
4266 tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4267 }
4268 }
4269 int flag = 0;
4270 for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4271 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4272 {
4273 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
(size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)))
;
4274 ccv_nnc_tensor_param_t params = symbol_info->info;
4275 params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4276 params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4277 flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4278 }
4279 if (flag)
4280 return -1;
4281 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4282 if (tensor_arena->vt_tensors[i])
4283 {
4284 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
(size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)))
;
4285 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4286 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4287 {
4288 assert(!tensor_arena->vt_alias_refs[i])((void) sizeof ((!tensor_arena->vt_alias_refs[i]) ? 1 : 0)
, __extension__ ({ if (!tensor_arena->vt_alias_refs[i]) ; else
__assert_fail ("!tensor_arena->vt_alias_refs[i]", "ccv_nnc_symbolic_graph_compile.c"
, 4288, __extension__ __PRETTY_FUNCTION__); }))
;
4289 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4290 } else if (!tensor_arena->vt_alias_refs[i]) {
4291 ccv_nnc_tensor_param_t params = symbol_info->info;
4292 params.datatype = tensor->info.datatype;
4293 params.reserved = tensor->info.reserved;
4294 tensor->info = params;
4295 } else {
4296 off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4297 ccv_nnc_tensor_param_t params = symbol_info->info;
4298 params.datatype = tensor->info.datatype;
4299 params.reserved = tensor->info.reserved;
4300 tensor->info = params;
4301 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4302 ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4303 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4304 {
4305 ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4306 memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4307 }
4308 }
4309 }
4310 // Should handle sub_tensor_arena, don't do that at the moment.
4311 assert(!graph->sub_graphs)((void) sizeof ((!graph->sub_graphs) ? 1 : 0), __extension__
({ if (!graph->sub_graphs) ; else __assert_fail ("!graph->sub_graphs"
, "ccv_nnc_symbolic_graph_compile.c", 4311, __extension__ __PRETTY_FUNCTION__
); }))
;
4312 return 0;
4313}
4314
4315void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4316{
4317 assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size)((void) sizeof ((symbolic_graph->exec_symbol_info->rnum
>= graph_exec_arena->graph_exec_size) ? 1 : 0), __extension__
({ if (symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena
->graph_exec_size) ; else __assert_fail ("symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 4317, __extension__ __PRETTY_FUNCTION__
); }))
;
4318 int i;
4319 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4320 {
4321 const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4322 if (graph_exec.d < 0)
4323 continue;
4324 const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4325 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(i)))
;
4326 ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4327 if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, replacing the backend and algorithm to the existing one, which hypothetically has been autotuned..
4328 {
4329 new_cmd.backend = existing_cmd.backend;
4330 new_cmd.algorithm = existing_cmd.algorithm;
4331 }
4332 ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4333 }
4334}
4335
4336void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4337{
4338 int i;
4339 for (i = 0; i < tensor_arena->buffer_size; i++)
4340 {
4341 if (!tensor_arena->buffers[i].ptr)
4342 continue;
4343 const int buffer_type = tensor_arena->buffers[i].type;;
4344 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
4345#ifdef HAVE_CUDA1
4346 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4347 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4348 {
4349 if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4350 tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4351 else
4352 cufree(device_id, tensor_arena->buffers[i].ptr);
4353 } else {
4354 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4354, __extension__ __PRETTY_FUNCTION__
); }))
;
4355 if (tensor_arena->buffers[i].pin_mem)
4356 cuhostfree(tensor_arena->buffers[i].ptr);
4357 else
4358 ccfreefree(tensor_arena->buffers[i].ptr);
4359 }
4360#elif defined(HAVE_MPS)
4361 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4362 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4363 {
4364 // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4365 // tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4366 // else
4367 mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4368 } else {
4369 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4369, __extension__ __PRETTY_FUNCTION__
); }))
;
4370 ccfreefree(tensor_arena->buffers[i].ptr);
4371 }
4372#else
4373 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4373, __extension__ __PRETTY_FUNCTION__
); }))
;
4374 ccfreefree(tensor_arena->buffers[i].ptr);
4375#endif
4376 tensor_arena->buffers[i].ptr = 0;
4377 }
4378 // For now, the life-cycle of the disposers lives with the buffer. It may ends before the tensor arena deallocates.
4379 if (tensor_arena->disposers)
4380 {
4381 for (i = 0; i < tensor_arena->disposers->rnum; i++)
4382 {
4383 ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i)((void*)(((char*)((tensor_arena->disposers)->data)) + (
size_t)(tensor_arena->disposers)->rsize * (size_t)(i)))
;
4384 disposer->dispose(disposer->ptr, disposer->userdata);
4385 }
4386 ccv_array_free(tensor_arena->disposers);
4387 tensor_arena->disposers = 0;
4388 }
4389}
4390
4391void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4392{
4393 ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4394 _ccv_nnc_tensor_arena_free(tensor_arena);
4395}
4396
4397void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4398{
4399 int i;
4400 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4401 if (graph_exec_arena->sub_arenas[i])
4402 ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4403 ccfreefree(graph_exec_arena);
4404}