Bug Summary

File: nnc/ccv_nnc_symbolic_graph_compile.c
Warning: line 4184, column 6
Dereference of null pointer

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name ccv_nnc_symbolic_graph_compile.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -fcoverage-compilation-dir=/home/liu/actions-runner/_work/ccv/ccv/lib/nnc -resource-dir /usr/local/lib/clang/19 -I ../ -I /usr/local/cuda/include -D HAVE_CBLAS -D HAVE_LIBPNG -D HAVE_LIBJPEG -D HAVE_FFTW3 -D HAVE_PTHREAD -D HAVE_LIBLINEAR -D HAVE_TESSERACT -D HAVE_AVCODEC -D HAVE_AVFORMAT -D HAVE_AVUTIL -D HAVE_SWSCALE -D HAVE_SSE2 -D HAVE_GSL -D HAVE_CUDA -D HAVE_CUDNN -D HAVE_NCCL -D USE_SYSTEM_CUB -I /usr/local/include -internal-isystem /usr/local/lib/clang/19/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -ferror-limit 19 -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/home/liu/actions-runner/_work/ccv/ccv/_analyze/2026-05-17-121939-2670738-1 -x c ccv_nnc_symbolic_graph_compile.c
1#include "ccv_nnc.h"
2#include "ccv_nnc_internal.h"
3#include "ccv_nnc_easy.h"
4#include "ccv_internal.h"
5#ifdef HAVE_CUDA1
6#include "gpu/ccv_nnc_compat.h"
7#elif defined(HAVE_MPS)
8#include "mps/ccv_nnc_mps.h"
9#endif
10#include "_ccv_nnc_graph.h"
11#include "_ccv_nnc_symbolic_graph.h"
12
13// MARK - Level-3 API
14
15typedef struct {
16 int flags;
17 int type;
18 int pin_mem; // This memory need to be pinned.
19 int ref; // Reference to another tensor block. Start with 1.
20 int alias_ref; // If reference to another tensor, and the other one is an alias. Start with 1.
21 int bypass_ref; // Copy over the bypass_ref from tensor symbol underneath. Start with 1.
22 int companion_ref; // Reference to another block that they two share the same memory region. Start with 1. the current crude implementation requires the two mutually be companion. Because there are two, we took the one that companion_ref <= i as the primary and companion_ref > i is the secondary. For allocation algorithm, we use the primary throughout.
23 int unfoldable_except_ref; // Reference to a tensor block that can be the exception to unfoldable (as output). Start with 1.
24 ccv_array_t* r_refs; // If this is referenced by another block, the array point back to these blocks. Start with 1.
25 uint64_t size; // The size of the tensor expected.
26 int p_refs[2]; // Reference to the parent tensor block, at max there will be only two. Start with 1.
27 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. It could be many. Start with 0.
28 ccv_array_t* head; // The head nodes (it could be multiple if from the graph, one cannot determine which is the first).
29 ccv_array_t* tail; // The tail nodes (it could be multiple if from the graph, one cannot determine which is the last).
30} ccv_nnc_tensor_block_t; // Tensor Arena Block
31
// Non-zero when the block at index `idx` is the primary of its companion pair, i.e. its own
// index is lower than the 0-based index of its companion (companion_ref - 1).
// A block without a companion (companion_ref == 0) compares against (uint32_t)-1 and is
// therefore always reported as primary.
#define IS_PRIMARY_COMPANION(idx, block) ((idx) < (uint32_t)((block).companion_ref - 1))
33
// Bits stored in ccv_nnc_tensor_block_t::flags.
// Bits 0-1 (mask 0x3) hold the block state, bits 2-3 (mask 0xc) hold the read / write mode,
// the remaining values are independent flag bits.
enum {
	UNASSIGNED = 0x1,
	ALIAS = 0x2,
	READ_ONLY = 0x4,
	WRITE_ONLY = 0x8,
	READ_WRITE = 0xc,
	ANONYMOUS = 0x10, // Mark this block as anonymous (thus, not referencing any specific tensor).
	UNFOLDABLE_AS_INPUT = 0x20, // If this block is used as input, it cannot be folded into any output blocks.
	UNFOLDABLE_AS_OUTPUT = 0x40, // If this block is used as output, it cannot be folded into any input blocks.
};

// State accessors. `t` is a tensor block lvalue; it is parenthesized so that expressions such
// as `*ptr` can be passed safely.
#define TENSOR_EXPECT_ORDINARY(t) (((t).flags & 0x3) == 0)
#define TENSOR_EXPECT_SET_ORDINARY(t) ((t).flags = ((t).flags & ~0x3))
#define TENSOR_EXPECT_UNASSIGNED(t) (((t).flags & 0x3) == UNASSIGNED)
#define TENSOR_EXPECT_SET_UNASSIGNED(t) ((t).flags = (((t).flags & ~0x3) | UNASSIGNED))
#define TENSOR_EXPECT_UNSET_UNASSIGNED(t) ((t).flags = ((t).flags & ~0x1))
#define TENSOR_EXPECT_ALIAS(t) (((t).flags & 0x3) == ALIAS)
// Computable means neither an alias nor unassigned.
#define TENSOR_EXPECT_COMPUTABLE(t) (!TENSOR_EXPECT_ALIAS(t) && !TENSOR_EXPECT_UNASSIGNED(t))
// Read / write mode accessors (mask 0xc). `rw` is parenthesized so that a compound expression
// cannot re-associate with the surrounding bitwise OR.
#define TENSOR_READ_WRITE(t) ((t).flags & 0xc)
#define TENSOR_SET_READ_WRITE(t, rw) ((t).flags = (((t).flags & ~0xc) | (rw)))
// Independent flag bits.
#define TENSOR_SET_ANONYMOUS(t) ((t).flags = (((t).flags & ~0x10) | ANONYMOUS))
#define TENSOR_IS_ANONYMOUS(t) ((t).flags & ANONYMOUS)
#define TENSOR_SET_UNFOLDABLE_AS_INPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_INPUT))
#define TENSOR_IS_UNFOLDABLE_AS_INPUT(t) ((t).flags & UNFOLDABLE_AS_INPUT)
#define TENSOR_SET_UNFOLDABLE_AS_OUTPUT(t) ((t).flags = ((t).flags | UNFOLDABLE_AS_OUTPUT))
#define TENSOR_IS_UNFOLDABLE_AS_OUTPUT(t) ((t).flags & UNFOLDABLE_AS_OUTPUT)

// Non-zero when the tensor symbol flags request initialization to zeros or to ones.
#define TENSOR_REQUIRE_INIT(flags) (((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS) || ((flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES))
62
// Holds additional information about the exec nodes.
typedef struct {
	int flags; // Presumably a combination of CCV_NNC_GRAPH_EXEC_ATTR_* bits; confirm against the users of exec_flags.
} ccv_nnc_graph_exec_flag_t;
67
// Attribute bits for exec nodes.
enum {
	CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO = 0x1, // Need to insert an additional IO transfer for the case..of statement.
};
71
// A candidate tensor block considered for the next allocation step.
typedef struct {
	int index; // Index of the candidate in the tensor_blocks array.
	int oc; // Overlap count of the candidate (plus that of its companion, if it has one).
	int type; // Type copied from the tensor block.
	uint64_t size; // Size of the candidate (the larger of the block and its companion, if it has one).
} ccv_nnc_tensor_opt_t;
78
79// We first sort the same type together (because they won't be reused at all.
80// And then we sort by size, after that, sort by oc.
81#define more_than(i1, i2, aux) (((i1).size > (i2).size) || ((i1).size == (i2).size && (i1).oc >= (i2).oc))
82static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_opt_sort_by_size_and_oc, ccv_nnc_tensor_opt_t, more_than)void _ccv_nnc_tensor_opt_sort_by_size_and_oc(ccv_nnc_tensor_opt_t
*array, size_t total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_opt_t
t; int sp = 0; struct { ccv_nnc_tensor_opt_t *lb; ccv_nnc_tensor_opt_t
*ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_opt_t
* left = stack[sp].lb; ccv_nnc_tensor_opt_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_opt_t
* ptr; ccv_nnc_tensor_opt_t* ptr2; if( n <= isort_thresh )
{ insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
{ for( ptr2 = ptr; ptr2 > left && more_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_opt_t
* left0; ccv_nnc_tensor_opt_t* left1; ccv_nnc_tensor_opt_t* right0
; ccv_nnc_tensor_opt_t* right1; ccv_nnc_tensor_opt_t* pivot; ccv_nnc_tensor_opt_t
* a; ccv_nnc_tensor_opt_t* b; ccv_nnc_tensor_opt_t* c; int swap_cnt
= 0; left0 = left; right0 = right; pivot = left + (n/2); if(
n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
+ 2*d; left = more_than(*a, *b, aux) ? (more_than(*b, *c, aux
) ? b : (more_than(*a, *c, aux) ? c : a)) : (more_than(*c, *b
, aux) ? b : (more_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = more_than(*a, *b, aux) ? (
more_than(*b, *c, aux) ? b : (more_than(*a, *c, aux) ? c : a)
) : (more_than(*c, *b, aux) ? b : (more_than(*a, *c, aux) ? a
: c)); a = right - 2*d, b = right - d, c = right; right = more_than
(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than(*a, *
c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
= more_than(*a, *b, aux) ? (more_than(*b, *c, aux) ? b : (more_than
(*a, *c, aux) ? c : a)) : (more_than(*c, *b, aux) ? b : (more_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !more_than(*pivot, *left
, aux) ) { if( !more_than(*left, *pivot, aux) ) { if( left >
left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
right && !more_than(*right, *pivot, aux) ) { if( !more_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
= 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
_b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
_b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
} else left = left0, right = left0 + n - 1; } else if( m >
1 ) left = right0 - m + 1, right = right0; else break; } } }
}
83#undef more_than
// A tensor block paired with a hop count, used as the sort record for _ccv_nnc_sort_by_hops.
typedef struct {
	int idx; // Block index; the allocation code stores it 1-based (block index + 1).
	int hop; // Hop count taken from the tensor dependency matrix.
} ccv_nnc_tensor_hop_t;
88#define less_than(i1, i2, aux) ((i1).hop < (i2).hop)
89static CCV_IMPLEMENT_QSORT(_ccv_nnc_sort_by_hops, ccv_nnc_tensor_hop_t, less_than)void _ccv_nnc_sort_by_hops(ccv_nnc_tensor_hop_t *array, size_t
total, int aux) { int isort_thresh = 7; ccv_nnc_tensor_hop_t
t; int sp = 0; struct { ccv_nnc_tensor_hop_t *lb; ccv_nnc_tensor_hop_t
*ub; } stack[48]; if( total <= 1 ) return; stack[0].lb = array
; stack[0].ub = array + (total - 1); while( sp >= 0 ) { ccv_nnc_tensor_hop_t
* left = stack[sp].lb; ccv_nnc_tensor_hop_t* right = stack[sp
--].ub; for(;;) { int i, n = (int)(right - left) + 1, m; ccv_nnc_tensor_hop_t
* ptr; ccv_nnc_tensor_hop_t* ptr2; if( n <= isort_thresh )
{ insert_sort: for( ptr = left + 1; ptr <= right; ptr++ )
{ for( ptr2 = ptr; ptr2 > left && less_than(ptr2[
0],ptr2[-1], aux); ptr2--) (((t)) = ((ptr2[0])), ((ptr2[0])) =
((ptr2[-1])), ((ptr2[-1])) = ((t))); } break; } else { ccv_nnc_tensor_hop_t
* left0; ccv_nnc_tensor_hop_t* left1; ccv_nnc_tensor_hop_t* right0
; ccv_nnc_tensor_hop_t* right1; ccv_nnc_tensor_hop_t* pivot; ccv_nnc_tensor_hop_t
* a; ccv_nnc_tensor_hop_t* b; ccv_nnc_tensor_hop_t* c; int swap_cnt
= 0; left0 = left; right0 = right; pivot = left + (n/2); if(
n > 40 ) { int d = n / 8; a = left, b = left + d, c = left
+ 2*d; left = less_than(*a, *b, aux) ? (less_than(*b, *c, aux
) ? b : (less_than(*a, *c, aux) ? c : a)) : (less_than(*c, *b
, aux) ? b : (less_than(*a, *c, aux) ? a : c)); a = pivot - d
, b = pivot, c = pivot + d; pivot = less_than(*a, *b, aux) ? (
less_than(*b, *c, aux) ? b : (less_than(*a, *c, aux) ? c : a)
) : (less_than(*c, *b, aux) ? b : (less_than(*a, *c, aux) ? a
: c)); a = right - 2*d, b = right - d, c = right; right = less_than
(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than(*a, *
c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than(
*a, *c, aux) ? a : c)); } a = left, b = pivot, c = right; pivot
= less_than(*a, *b, aux) ? (less_than(*b, *c, aux) ? b : (less_than
(*a, *c, aux) ? c : a)) : (less_than(*c, *b, aux) ? b : (less_than
(*a, *c, aux) ? a : c)); if( pivot != left0 ) { (((t)) = ((*pivot
)), ((*pivot)) = ((*left0)), ((*left0)) = ((t))); pivot = left0
; } left = left1 = left0 + 1; right = right1 = right0; for(;;
) { while( left <= right && !less_than(*pivot, *left
, aux) ) { if( !less_than(*left, *pivot, aux) ) { if( left >
left1 ) (((t)) = ((*left1)), ((*left1)) = ((*left)), ((*left
)) = ((t))); swap_cnt = 1; left1++; } left++; } while( left <=
right && !less_than(*right, *pivot, aux) ) { if( !less_than
(*pivot, *right, aux) ) { if( right < right1 ) (((t)) = ((
*right1)), ((*right1)) = ((*right)), ((*right)) = ((t))); swap_cnt
= 1; right1--; } right--; } if( left > right ) break; (((
t)) = ((*left)), ((*left)) = ((*right)), ((*right)) = ((t)));
swap_cnt = 1; left++; right--; } if( swap_cnt == 0 ) { left =
left0, right = right0; goto insert_sort; } n = ({ typeof ((int
)(left1 - left0)) _a = ((int)(left1 - left0)); typeof ((int)(
left - left1)) _b = ((int)(left - left1)); (_a < _b) ? _a :
_b; }); for( i = 0; i < n; i++ ) (((t)) = ((left0[i])), (
(left0[i])) = ((left[i-n])), ((left[i-n])) = ((t))); n = ({ typeof
((int)(right0 - right1)) _a = ((int)(right0 - right1)); typeof
((int)(right1 - right)) _b = ((int)(right1 - right)); (_a <
_b) ? _a : _b; }); for( i = 0; i < n; i++ ) (((t)) = ((left
[i])), ((left[i])) = ((right0[i-n+1])), ((right0[i-n+1])) = (
(t))); n = (int)(left - left1); m = (int)(right1 - right); if
( n > 1 ) { if( m > 1 ) { if( n > m ) { stack[++sp].
lb = left0; stack[sp].ub = left0 + n - 1; left = right0 - m +
1, right = right0; } else { stack[++sp].lb = right0 - m + 1;
stack[sp].ub = right0; left = left0, right = left0 + n - 1; }
} else left = left0, right = left0 + n - 1; } else if( m >
1 ) left = right0 - m + 1, right = right0; else break; } } }
}
90#undef less_than
91
92// If b has items overlap with a, a is still after b (inclusive).
93static int _ccv_nnc_tensor_block_a_after_b_inclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
94{
95 assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
("a", "ccv_nnc_symbolic_graph_compile.c", 95, __extension__ __PRETTY_FUNCTION__
); }))
;
96 assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
("b", "ccv_nnc_symbolic_graph_compile.c", 96, __extension__ __PRETTY_FUNCTION__
); }))
;
97 int x, y;
98 for (x = 0; x < b->rnum; x++)
99 {
100 const int p = *(int*)ccv_array_get(b, x)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(x)))
;
101 int flag = 0;
102 // In extreme cases where a is a superset of b, then a is still after b, we are good.
103 for (y = 0; !flag && y < a->rnum; y++)
104 {
105 const int q = *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y)))
;
106 flag = (p == q);
107 }
108 if (!flag)
109 for (y = 0; y < a->rnum; y++)
110 {
111 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, *(int*)ccv_array_get(a, y)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(y)))
, p);
112 if (!cell.i32 || cell.i32[0] == 0)
113 return 0;
114 }
115 }
116 // If b->rnum == 0, a is after b for sure.
117 // Otherwise, if a->rnum == 0, we don't check any, buf if b->rnum > 0, then we cannot say a is after b.
118 // if both a->rnum > 0 and b->rnum > 0, above logic should checked all.
119 return (a->rnum > 0 || b->rnum == 0);
120}
121
122static int _ccv_nnc_tensor_block_a_after_b_exclusively(const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const a, const ccv_array_t* const b)
123{
124 assert(a)((void) sizeof ((a) ? 1 : 0), __extension__ ({ if (a) ; else __assert_fail
("a", "ccv_nnc_symbolic_graph_compile.c", 124, __extension__
__PRETTY_FUNCTION__); }))
;
125 assert(b)((void) sizeof ((b) ? 1 : 0), __extension__ ({ if (b) ; else __assert_fail
("b", "ccv_nnc_symbolic_graph_compile.c", 125, __extension__
__PRETTY_FUNCTION__); }))
;
126 if (!a->rnum || !b->rnum)
127 return 0;
128 int x, y, max_hop = 0;
129 for (x = 0; x < a->rnum; x++)
130 {
131 ccv_sparse_matrix_vector_t* const vector = ccv_get_sparse_matrix_vector(exec_dep, *(int*)ccv_array_get(a, x)((void*)(((char*)((a)->data)) + (size_t)(a)->rsize * (size_t
)(x)))
);
132 if (!vector)
133 return 0;
134 for (y = 0; y < b->rnum; y++)
135 {
136 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, *(int*)ccv_array_get(b, y)((void*)(((char*)((b)->data)) + (size_t)(b)->rsize * (size_t
)(y)))
);
137 if (!cell.i32 || cell.i32[0] == 0)
138 return 0;
139 if (cell.i32[0] > max_hop)
140 max_hop = cell.i32[0];
141 }
142 }
143 // We've entered this nested-for loop, therefore, it must be verifiably, deterministically after b now.
144 // The max hop also denotes if that is the case, how many hops, maximally speaking, we need to get from a to b.
145 return max_hop;
146}
147
148// If every a's head is deterministically after b's tail
149static int _ccv_nnc_tensor_block_head_after_tail(const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t a, const ccv_nnc_tensor_block_t b)
150{
151 return _ccv_nnc_tensor_block_a_after_b_exclusively(exec_dep, a.head, b.tail);
152}
153
154typedef struct {
155 ccv_array_t** alloc_dep;
156 int vt_block_size;
157 int buffer_size;
158 int block_size;
159 int* vt_blocks; // A reference to the block, because blocks only contains available block (thus, doesn't consider alias etc.). -1 means no block pointed to. Starts at 0.
160 struct {
161 int type; // The type from tensor blocks.
162 int pin_mem; // Whether this is pinned memory.
163 int flags; // The flags (currently for READ_ONLY or not).
164 uint64_t size; // The size of the buffer allocated.
165 int p_refs[2]; // Reference to the upper level block, Starts at 1. Only index 0 is valid throughout, I do use two in the code as a temporary placeholder.
166 ccv_array_t* dup_p_refs; // Reference to the parent tensor block from the duplicated tensor blocks. From buffer, it can point to multiple because it can be associated with multiple tensor blocks that points to different outputs (for example, in 1st unroll, pointing to one block while in 2nd unroll, pointing to another). Start with 0.
167 }* buffers;
168 struct {
169 int buffer_ref; // A reference for block to which buffer to use. Starts at 0.
170 int block_ref; // A reference to which block in the given tensor_block to use.
171 uint64_t offset; // The offset of this block.
172 }* blocks;
173} ccv_nnc_tensor_alloc_prep_t;
174
175typedef struct ccv_nnc_symbolic_graph_prep_s {
176 int flags;
177 int while_count_tensor; // This graph will generate a while count tensor. If this is set to 1, we reserve tensor_metadata at 0 for this.
178 int p_idx; // Reference to the index in its parent graph's sub-graph array, Starts at 1.
179 int exec_idx;
180 int unroll_count; // How many times this graph is unrolled before we can have proper assignment.
181 int tensor_symbol_info_size;
182 int exec_symbol_info_size;
183 int tensor_block_size;
184 int sub_prep_size;
185 ccv_nnc_tensor_block_t* tensor_blocks;
186 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info;
187 ccv_nnc_graph_exec_flag_t* exec_flags;
188 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info;
189 int* dup_tensor_block_ref;
190 ccv_nnc_graph_visit_t* visit;
191 ccv_nnc_tensor_alloc_prep_t* alloc_prep;
192 struct ccv_nnc_symbolic_graph_prep_s* p;
193 struct ccv_nnc_symbolic_graph_prep_s** sub_preps; // The preps of its sub-graphs.
194 // Structures that don't require to be freed after deallocation.
195 const ccv_nnc_symbolic_graph_t* symbolic_graph; // Constant because I cannot modify it.
196 ccv_nnc_graph_t* graph; // Materialized graph, not managed by prep after created.
197 ccv_nnc_tensor_arena_t* tensor_arena; // Tensor arena, not managed by prep as well.
198 ccv_array_t* dup_breakpoints; // The noop breakpoints, used to extend the inputs life-cycle for while expr.
199} ccv_nnc_symbolic_graph_prep_t;
200
201typedef struct {
202 int oc;
203 ccv_array_t* itf;
204} ccv_nnc_tensor_block_adjacent_t;
205
206static ccv_nnc_tensor_alloc_prep_t* _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
207{
208 // Compute how many dis-continuous buffers are needed.
209 // We prefer to have several dis-continuous buffers instead of one big buffer because
210 // in this way, we can rely on system memory allocators (jemalloc, tcmalloc, or CUDA's allocator)
211 // to fully utilize memory.
212 int i, j, k;
213 ccv_array_t** const alloc_dep = (ccv_array_t**)cccalloccalloc(tensor_block_size, sizeof(ccv_array_t*));
214 int allocable_tensor_size = 0, available_tensor_size = 0;
215 for (i = 0; i < tensor_block_size; i++)
216 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
217 {
218 // Tensors that we need the header info.
219 ++available_tensor_size;
220 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
221 // Tensors that we actually need to allocate (exclude the alias).
222 ++allocable_tensor_size;
223 }
224 ccv_sparse_matrix_t* const tensor_df = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
225 ccv_sparse_matrix_t* const tensor_dt = ccv_sparse_matrix_new(tensor_block_size, tensor_block_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
226 ccv_nnc_tensor_block_adjacent_t* const adj = (ccv_nnc_tensor_block_adjacent_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_adjacent_t));
227 // Overlap count.
228 for (i = 0; i < tensor_block_size; i++)
229 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
230 for (j = i + 1; j < tensor_block_size; j++)
231 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[j])(!((tensor_blocks[j].flags & 0x3) == ALIAS) && !(
(tensor_blocks[j].flags & 0x3) == UNASSIGNED))
)
232 {
233 // We only reuse buffers within the same memory type. The tensor_dt / tensor_df
234 // matrices are only queried later for same-type candidates in this function,
235 // thus cross-type hop relations are not needed for allocation planning here.
236 if (tensor_blocks[i].type != tensor_blocks[j].type)
237 continue;
238 // Check to see if they interfere (default to yes).
239 // If any of the i's head is deterministically later than j's tail
240 // or any of the i's tail is deterministically earlier than j's head, they don't interfere.
241 const int i_hop_j = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[j]);
242 int j_hop_i = 0;
243 if (i_hop_j > 0)
244 {
245 ccv_set_sparse_matrix_cell(tensor_dt, i, j, &i_hop_j);
246 ccv_set_sparse_matrix_cell(tensor_df, j, i, &i_hop_j);
247 } else {
248 // It cannot be that both directions are positive. If i can hop to j, we don't
249 // need the reverse hop value for any subsequent allocation decision.
250 j_hop_i = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[j], tensor_blocks[i]);
251 if (j_hop_i > 0)
252 {
253 ccv_set_sparse_matrix_cell(tensor_dt, j, i, &j_hop_i);
254 ccv_set_sparse_matrix_cell(tensor_df, i, j, &j_hop_i);
255 }
256 }
257 if (!i_hop_j && !j_hop_i)
258 {
259 if (!adj[i].itf)
260 adj[i].itf = ccv_array_new(sizeof(int), 1, 0);
261 ccv_array_push(adj[i].itf, &j);
262 ++adj[i].oc;
263 if (!adj[j].itf)
264 adj[j].itf = ccv_array_new(sizeof(int), 1, 0);
265 ccv_array_push(adj[j].itf, &i);
266 ++adj[j].oc;
267 }
268 }
269 const int exec_dep_rows = exec_dep->rows;
270 ccv_matrix_free(exec_dep);
271 ccv_nnc_tensor_hop_t* const buf = (ccv_nnc_tensor_hop_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_hop_t) * tensor_block_size);
272 int* const assigned = (int*)cccalloccalloc(tensor_block_size, sizeof(int));
273 uint64_t* const allocated_offset = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
274 uint64_t* const allocated_size = (uint64_t*)cccalloccalloc(tensor_block_size, sizeof(uint64_t));
275 uint32_t* const tensor_block_cannot_insert = (uint32_t*)cccalloccalloc(((tensor_block_size + 31) >> 5), sizeof(uint32_t));
276 int num_assigned = 0;
277 // I can do a bit of optimization here to assign out const tensors first, but heck, this just works for now.
278 // Allocation graph (assuming there is a source node and a destination node, which are 0 and (tensor_block_size + 1) respectively).
279 // The first channel denotes the bytes available for allocation,
280 // the second channel denotes the offset available for the allocation.
281 ccv_sparse_matrix_t* alloc = ccv_sparse_matrix_new(tensor_block_size + 2, tensor_block_size + 2, CCV_64S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
282 ccv_array_t* opt = ccv_array_new(sizeof(ccv_nnc_tensor_opt_t), 1, 0);
283 for (j = 0; j < allocable_tensor_size;)
284 {
285 // Find the one with largest overlap (in case overlap is the same, larger size), and it is not assigned.
286 uint64_t max_size = 0;
287 ccv_array_clear(opt);
288 int current_type = 0; // Deal with one type at a time.
289 for (i = 0; i < tensor_block_size; i++)
290 if (tensor_blocks[i].size >= max_size &&
291 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&& !assigned[i] &&
292 IS_PRIMARY_COMPANION(i, tensor_blocks[i])((i) < (uint32_t)((tensor_blocks[i]).companion_ref - 1)) &&
293 (!current_type || tensor_blocks[i].type == current_type))
294 {
295 ccv_nnc_tensor_opt_t a = {
296 .size = tensor_blocks[i].size,
297 .index = i,
298 .oc = adj[i].oc,
299 .type = tensor_blocks[i].type,
300 };
301 assert(a.type)((void) sizeof ((a.type) ? 1 : 0), __extension__ ({ if (a.type
) ; else __assert_fail ("a.type", "ccv_nnc_symbolic_graph_compile.c"
, 301, __extension__ __PRETTY_FUNCTION__); }))
;
302 current_type = a.type; // Now we know the primary type we should deal with.
303 if (tensor_blocks[i].companion_ref)
304 {
305 const int companion_ref = tensor_blocks[i].companion_ref - 1;
306 a.size = ccv_max(a.size, tensor_blocks[companion_ref].size)({ typeof (a.size) _a = (a.size); typeof (tensor_blocks[companion_ref
].size) _b = (tensor_blocks[companion_ref].size); (_a > _b
) ? _a : _b; })
;
307 a.oc += adj[companion_ref].oc;
308 }
309 // In case we have a tie, take them all in the array.
310 if (a.size > max_size)
311 ccv_array_clear(opt), max_size = a.size;
312 ccv_array_push(opt, &a);
313 }
314 assert(opt->rnum > 0)((void) sizeof ((opt->rnum > 0) ? 1 : 0), __extension__
({ if (opt->rnum > 0) ; else __assert_fail ("opt->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 314, __extension__ __PRETTY_FUNCTION__
); }))
;
315 // Order opt array by the oc because type and size should be equal at this point.
316 _ccv_nnc_tensor_opt_sort_by_size_and_oc((ccv_nnc_tensor_opt_t*)opt->data, opt->rnum, 0);
317 // Go through opt array again, this time, it is ordered by size, therefore, if we found a place to insert, we are good.
318 int min_y = 0, min_x = tensor_block_size + 1, min_i = -1, min_hop = exec_dep_rows * 3;
319 uint64_t min_val[2] = {
320 0, 0
321 };
322 if (j > 0)
323 {
324 for (i = 0; i < opt->rnum; i++)
325 {
326 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, i)((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
(size_t)(i)))
;
327 if ((tensor_block_cannot_insert[a.index >> 5] & (1u << (a.index & 0x1f))))
328 continue;
329 // Now, determine the order between a and c. After this, we can always check whether y
330 // can hop to the earliest one and if the latest one can hop to x.
331 // The earliest one will be called p and the latest one will be called q.
332 int p = a.index;
333 int q = a.index;
334 if (tensor_blocks[a.index].companion_ref)
335 {
336 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
337 if ((tensor_block_cannot_insert[companion_ref >> 5] & (1u << (companion_ref & 0x1f))))
338 continue;
339 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, p, companion_ref);
340 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
341 p = companion_ref;
342 else {
343 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, q);
344 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
345 q = companion_ref;
346 else { // Otherwise, b is in between p and q.
347 const ccv_numeric_data_t p_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, p);
348 const ccv_numeric_data_t b_hop_q = ccv_get_sparse_matrix_cell(tensor_dt, q, companion_ref);
349 assert(p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0)((void) sizeof ((p_hop_b.i32 && p_hop_b.i32[0] > 0
&& b_hop_q.i32 && b_hop_q.i32[0] > 0) ? 1
: 0), __extension__ ({ if (p_hop_b.i32 && p_hop_b.i32
[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] >
0) ; else __assert_fail ("p_hop_b.i32 && p_hop_b.i32[0] > 0 && b_hop_q.i32 && b_hop_q.i32[0] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 349, __extension__ __PRETTY_FUNCTION__
); }))
;
350 }
351 }
352 }
353 assert(tensor_blocks[q].type == tensor_blocks[p].type)((void) sizeof ((tensor_blocks[q].type == tensor_blocks[p].type
) ? 1 : 0), __extension__ ({ if (tensor_blocks[q].type == tensor_blocks
[p].type) ; else __assert_fail ("tensor_blocks[q].type == tensor_blocks[p].type"
, "ccv_nnc_symbolic_graph_compile.c", 353, __extension__ __PRETTY_FUNCTION__
); }))
;
354 const int type = tensor_blocks[p].type;
355 // y is always earlier than x, but this is hard to assert now.
356 // If this edge satisfies the requirement, we now need to find the ones with the tightest possible bounds.
357 // Thus, the hop between y and x (through a) should be the smallest one.
358 // We optimize this by first finding all allocated nodes that come into p, and all allocated nodes that
359 // come out of q. For these nodes, we try to verify whether they form a connection (by checking against
360 // the alloc sparse matrix). If they do, try to see whether we can insert with the tightest bound.
361 int y_size = 0;
362 ccv_nnc_tensor_hop_t* const y_buf = buf;
363#define for_block(y, val) do { \
364 if (((int*)val)[0] > 0 && assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size >= a.size) \
365 y_buf[y_size++] = (ccv_nnc_tensor_hop_t){ \
366 .idx = y + 1, .hop = ((int*)val)[0] \
367 }; \
368 } while(0)
369 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
370 if (y_vector)
371 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
372#undef for_block
373 assert(y_size <= tensor_block_size)((void) sizeof ((y_size <= tensor_block_size) ? 1 : 0), __extension__
({ if (y_size <= tensor_block_size) ; else __assert_fail (
"y_size <= tensor_block_size", "ccv_nnc_symbolic_graph_compile.c"
, 373, __extension__ __PRETTY_FUNCTION__); }))
;
374 int x_size = 0;
375 ccv_nnc_tensor_hop_t* const x_buf = buf + y_size;
376#define for_block(x, val) do { \
377 if (((int*)val)[0] > 0 && assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size >= a.size) \
378 x_buf[x_size++] = (ccv_nnc_tensor_hop_t){ \
379 .idx = x + 1, .hop = ((int*)val)[0] \
380 }; \
381 } while(0)
382 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
383 if (x_vector)
384 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
385#undef for_block
386 assert(y_size + x_size <= tensor_block_size)((void) sizeof ((y_size + x_size <= tensor_block_size) ? 1
: 0), __extension__ ({ if (y_size + x_size <= tensor_block_size
) ; else __assert_fail ("y_size + x_size <= tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 386, __extension__ __PRETTY_FUNCTION__
); }))
;
387 int x, y;
388 if (y_size > 1)
389 _ccv_nnc_sort_by_hops(y_buf, y_size, 0);
390 for (y = 0; y < y_size; y++)
391 {
392 const int hop = exec_dep_rows + y_buf[y].hop;
393 if (hop >= min_hop)
394 break;
395 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, y_buf[y].idx, tensor_block_size + 1);
396 if (val.u64 && val.u64[0] >= a.size)
397 {
398 min_y = y_buf[y].idx, min_x = tensor_block_size + 1, min_hop = hop,
399 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
400 break;
401 }
402 }
403 if (x_size > 1)
404 _ccv_nnc_sort_by_hops(x_buf, x_size, 0);
405 for (x = 0; x < x_size; x++)
406 {
407 const int hop = exec_dep_rows + x_buf[x].hop;
408 if (hop >= min_hop)
409 break;
410 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell(alloc, 0, x_buf[x].idx);
411 if (val.u64 && val.u64[0] >= a.size)
412 {
413 min_y = 0, min_x = x_buf[x].idx, min_hop = hop,
414 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
415 break;
416 }
417 }
418 if (x_size > 0)
419 {
420 const int x_min_hop = x_buf[0].hop;
421 for (y = 0; y < y_size; y++)
422 {
423 const int y_hop_p_v = y_buf[y].hop;
424 if (y_hop_p_v + x_min_hop >= min_hop)
425 break;
426 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(alloc, y_buf[y].idx);
427 if (y_vector)
428 {
429 for (x = 0; x < x_size; x++)
430 {
431 const int q_hop_x_v = x_buf[x].hop;
432 const int hop = y_hop_p_v + q_hop_x_v;
433 if (hop >= min_hop)
434 break;
435 const ccv_numeric_data_t val = ccv_get_sparse_matrix_cell_from_vector(alloc, y_vector, x_buf[x].idx);
436 if (val.u64 && val.u64[0] >= a.size)
437 {
438 min_y = y_buf[y].idx, min_x = x_buf[x].idx, min_hop = hop,
439 min_val[0] = val.u64[0], min_val[1] = val.u64[1];
440 break;
441 }
442 }
443 }
444 }
445 }
446 // If I found a place, stop, and exit.
447 if (min_y > 0 || min_x < tensor_block_size + 1)
448 {
449 min_i = i;
450 break;
451 }
452 // There is no space to insert this block, mark it as such.
453 tensor_block_cannot_insert[a.index >> 5] |= (1u << (a.index & 0x1f));
454 if (tensor_blocks[a.index].companion_ref)
455 {
456 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
457 tensor_block_cannot_insert[companion_ref >> 5] |= (1u << (companion_ref & 0x1f));
458 }
459 }
460 }
461 // If I cannot find a place, then start a new connection between min_y and min_x (a new assignment group),
462 // and default to the largest size available.
463 ccv_nnc_tensor_opt_t a = *(ccv_nnc_tensor_opt_t*)ccv_array_get(opt, ccv_max(0, min_i))((void*)(((char*)((opt)->data)) + (size_t)(opt)->rsize *
(size_t)(({ typeof (0) _a = (0); typeof (min_i) _b = (min_i)
; (_a > _b) ? _a : _b; }))))
;
464 if (min_i == -1)
465 {
466 allocated_size[num_assigned] = a.size;
467 ++num_assigned;
468 }
469 int assign_group = num_assigned;
470 if (min_y > 0)
471 {
472 assign_group = assigned[min_y - 1];
473 // The y and x should belong to the same assigned group.
474 assert(min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group)((void) sizeof ((min_x == tensor_block_size + 1 || assigned[min_x
- 1] == assign_group) ? 1 : 0), __extension__ ({ if (min_x ==
tensor_block_size + 1 || assigned[min_x - 1] == assign_group
) ; else __assert_fail ("min_x == tensor_block_size + 1 || assigned[min_x - 1] == assign_group"
, "ccv_nnc_symbolic_graph_compile.c", 474, __extension__ __PRETTY_FUNCTION__
); }))
;
475 } else if (min_x < tensor_block_size + 1)
476 assign_group = assigned[min_x - 1];
477 // If min_y is source and min_x is destination, we don't need to do anything, otherwise, decrease the weight on that edge.
478 if (min_y != 0 || min_x != tensor_block_size + 1)
479 {
480 uint64_t val[2] = {
481 min_val[0], min_val[1]
482 };
483 assert(val[0] >= a.size)((void) sizeof ((val[0] >= a.size) ? 1 : 0), __extension__
({ if (val[0] >= a.size) ; else __assert_fail ("val[0] >= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 483, __extension__ __PRETTY_FUNCTION__
); }))
;
484 val[0] -= a.size;
485 val[1] = val[1] + a.size; // Move the offset to the next one.
486 ccv_set_sparse_matrix_cell(alloc, min_y, min_x, val);
487 }
488 int strings[3];
489 strings[0] = a.index + 1;
490 int string_size = 1;
491 // Assign out the designated companion if it exists.
492 if (tensor_blocks[a.index].companion_ref)
493 {
494 const int companion_ref = tensor_blocks[a.index].companion_ref - 1;
495 assert(tensor_blocks[a.index].type == tensor_blocks[companion_ref].type)((void) sizeof ((tensor_blocks[a.index].type == tensor_blocks
[companion_ref].type) ? 1 : 0), __extension__ ({ if (tensor_blocks
[a.index].type == tensor_blocks[companion_ref].type) ; else __assert_fail
("tensor_blocks[a.index].type == tensor_blocks[companion_ref].type"
, "ccv_nnc_symbolic_graph_compile.c", 495, __extension__ __PRETTY_FUNCTION__
); }))
;
496 const ccv_numeric_data_t b_hop_p = ccv_get_sparse_matrix_cell(tensor_dt, strings[0] - 1, companion_ref);
497 if (b_hop_p.i32 && b_hop_p.i32[0] > 0)
498 {
499 for (i = 0; i < string_size; i++)
500 strings[i + 1] = strings[i];
501 strings[0] = companion_ref + 1;
502 } else {
503 const ccv_numeric_data_t q_hop_b = ccv_get_sparse_matrix_cell(tensor_dt, companion_ref, strings[string_size - 1] - 1);
504 if (q_hop_b.i32 && q_hop_b.i32[0] > 0)
505 strings[string_size] = companion_ref + 1;
506 else {
507 // Because b_hop_p is 0, q_hop_b is nil, p != q, and b must be in between p and q. Therefore, I must have 2 allocations.
508 assert(string_size == 2)((void) sizeof ((string_size == 2) ? 1 : 0), __extension__ ({
if (string_size == 2) ; else __assert_fail ("string_size == 2"
, "ccv_nnc_symbolic_graph_compile.c", 508, __extension__ __PRETTY_FUNCTION__
); }))
;
509 strings[2] = strings[1];
510 strings[1] = companion_ref + 1;
511 }
512 }
513 ++string_size;
514 }
515 // Assign out and update oc.
516 for (i = 0; i < string_size; i++)
517 {
518 const int index = strings[i] - 1;
519 // Assign out the selected one.
520 assigned[index] = assign_group;
521 // The offset for this one, should be either 0 (started a new group, when min_i == -1), or the offset on this edge.
522 allocated_offset[index] = min_val[1];
523 if (adj[index].itf)
524 for (k = 0; k < adj[index].itf->rnum; k++)
525 {
526 const int d = *(int*)ccv_array_get(adj[index].itf, k)((void*)(((char*)((adj[index].itf)->data)) + (size_t)(adj[
index].itf)->rsize * (size_t)(k)))
;
527 if (!assigned[d] && TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])(!((tensor_blocks[d].flags & 0x3) == ALIAS) && !(
(tensor_blocks[d].flags & 0x3) == UNASSIGNED))
)
528 --adj[d].oc;
529 }
530 }
531 uint64_t val[2] = {
532 a.size, min_val[1]
533 };
534 uint64_t consumed_size = 0;
535 // Go over from min_y to string_size (excluding min_x).
536 for (i = 0; i < string_size; i++)
537 {
538 const uint64_t size = tensor_blocks[strings[i] - 1].size;
539 assert(size <= a.size)((void) sizeof ((size <= a.size) ? 1 : 0), __extension__ (
{ if (size <= a.size) ; else __assert_fail ("size <= a.size"
, "ccv_nnc_symbolic_graph_compile.c", 539, __extension__ __PRETTY_FUNCTION__
); }))
;
540 // Update consumed size if it is bigger than "size".
541 if (size > consumed_size)
542 {
543 val[0] = size - consumed_size;
544 ccv_set_sparse_matrix_cell(alloc, min_y, strings[i], val);
545 consumed_size = size;
546 val[1] = min_val[1] + consumed_size;
547 }
548 // If it consumed all the flow, break out.
549 if (consumed_size == a.size)
550 break;
551 }
552 for (i = 0; i < string_size; i++)
553 {
554 const uint64_t i_size = tensor_blocks[strings[i] - 1].size;
555 uint64_t val[2] = {
556 i_size, min_val[1]
557 };
558 uint64_t consumed_size = 0;
559 for (k = i + 1; k < string_size; k++)
560 {
561 const uint64_t size = ccv_min(i_size, tensor_blocks[strings[k] - 1].size)({ typeof (i_size) _a = (i_size); typeof (tensor_blocks[strings
[k] - 1].size) _b = (tensor_blocks[strings[k] - 1].size); (_a
< _b) ? _a : _b; })
;
562 // Update consumed size if it is bigger than "size".
563 if (size > consumed_size)
564 {
565 val[0] = size - consumed_size;
566 ccv_set_sparse_matrix_cell(alloc, strings[i], strings[k], val);
567 consumed_size = size;
568 val[1] = min_val[1] + consumed_size;
569 }
570 // If it consumed all the flow, break out.
571 if (consumed_size == i_size)
572 break;
573 }
574 val[0] = i_size - consumed_size;
575 // Still have residual, flow it to min_x.
576 if (val[0] > 0)
577 ccv_set_sparse_matrix_cell(alloc, strings[i], min_x, val);
578 }
579 if (min_i == -1)
580 {
581 // If we decide to insert a new edge, simply mark anyone that does not interfere with it to be redone.
582 const int p = strings[0] - 1;
583 const int q = strings[string_size - 1] - 1;
584 const int type = tensor_blocks[p].type;
585#define for_block(y, val) do { \
586 if (((int*)val)[0] > 0 && !assigned[y] && tensor_blocks[y].type == type && tensor_blocks[y].size <= a.size) \
587 { \
588 tensor_block_cannot_insert[y >> 5] &= ~(1u << (y & 0x1f)); \
589 if (tensor_blocks[y].companion_ref) \
590 { \
591 const int companion_ref = tensor_blocks[y].companion_ref - 1; \
592 tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
593 } \
594 } \
595 } while(0)
596 ccv_sparse_matrix_vector_t* const y_vector = ccv_get_sparse_matrix_vector(tensor_dt, p);
597 if (y_vector)
598 CCV_SPARSE_VECTOR_FOREACH(tensor_dt, y_vector, for_block)do { switch ((((tensor_dt)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_dt)->type) & 0xFFF); if ((tensor_dt)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (y_vector)->
size; _i_++) { for_block((_i_), ((y_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_dt)->type) & 0xFF000
) >> 12] * (((tensor_dt)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(y_vector)->index;
for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_dt
)->type) & 0xFFF); if ((tensor_dt)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (y_vector)->size; _i_++) { for_block
((_i_), ((y_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_dt)->type) & 0xFF000) >> 12] * (((tensor_dt
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(y_vector)->index; for (_i_ = 0; _i_ < (y_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
599#undef for_block
600#define for_block(x, val) do { \
601 if (((int*)val)[0] > 0 && !assigned[x] && tensor_blocks[x].type == type && tensor_blocks[x].size <= a.size) \
602 { \
603 tensor_block_cannot_insert[x >> 5] &= ~(1u << (x & 0x1f)); \
604 if (tensor_blocks[x].companion_ref) \
605 { \
606 const int companion_ref = tensor_blocks[x].companion_ref - 1; \
607 tensor_block_cannot_insert[companion_ref >> 5] &= ~(1u << (companion_ref & 0x1f)); \
608 } \
609 } \
610 } while(0)
611 ccv_sparse_matrix_vector_t* const x_vector = ccv_get_sparse_matrix_vector(tensor_df, q);
612 if (x_vector)
613 CCV_SPARSE_VECTOR_FOREACH(tensor_df, x_vector, for_block)do { switch ((((tensor_df)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f32 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.i64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((tensor_df)->type) & 0xFFF); if ((tensor_df)->type
& CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (x_vector)->
size; _i_++) { for_block((_i_), ((x_vector)->data.f64 + (_i_
* _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((tensor_df)->type) & 0xFF000
) >> 12] * (((tensor_df)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(x_vector)->index;
for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((tensor_df
)->type) & 0xFFF); if ((tensor_df)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (x_vector)->size; _i_++) { for_block
((_i_), ((x_vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((tensor_df)->type) & 0xFF000) >> 12] * (((tensor_df
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(x_vector)->index; for (_i_ = 0; _i_ < (x_vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
614#undef for_block
615 }
616 j += string_size;
617 }
618 ccfreefree(tensor_block_cannot_insert);
619 ccfreefree(buf);
620 ccv_array_free(opt);
621 ccv_matrix_free(tensor_df);
622 ccv_matrix_free(tensor_dt);
623#define for_block(y, x, val) do { \
624 if (((uint64_t*)val)[0] > 0 && y > 0 && x < tensor_block_size + 1) \
625 { \
626 if (!alloc_dep[x - 1]) \
627 alloc_dep[x - 1] = ccv_array_new(sizeof(int), 1, 0); \
628 ccv_array_add_unique_int(alloc_dep[x - 1], y - 1); \
629 } \
630 } while (0)
631 CCV_SPARSE_FOREACH(alloc, for_block)do { if ((alloc)->major & CCV_SPARSE_COL_MAJOR) { switch
((((alloc)->type) & 0xFF000)) { case CCV_32S: { do { uint32_t
_i_, _j_; const uint32_t _size_ = (alloc)->size; __attribute__
((unused)) const size_t _c_ = (((alloc)->type) & 0xFFF
); if ((alloc)->type & CCV_DENSE_VECTOR) { for (_i_ = 0
; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t* const _idx_
= (alloc)->index + _i_; ccv_sparse_matrix_vector_t* const
_v_ = (alloc)->vector + _i_; if (_idx_->ifbit <= 1 ||
!_v_->size) continue; for (_j_ = 0; _j_ < _v_->size
; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.i32 +
(_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
(alloc)->size; __attribute__((unused)) const size_t _c_ =
(((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_j_), (_idx_->i), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_j_->
i), (_idx_->i), (_d_.u8 + (0))); } } } } while (0); } } } else
{ switch ((((alloc)->type) & 0xFF000)) { case CCV_32S
: { do { uint32_t _i_, _j_; const uint32_t _size_ = (alloc)->
size; __attribute__((unused)) const size_t _c_ = (((alloc)->
type) & 0xFFF); if ((alloc)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i32 + (0))); } } } } while (0); break
; } case CCV_32F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f32 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f32 + (0))); } } } } while (0); break
; } case CCV_64S: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
i64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.i64 + (0))); } } } } while (0); break
; } case CCV_64F: { do { uint32_t _i_, _j_; const uint32_t _size_
= (alloc)->size; __attribute__((unused)) const size_t _c_
= (((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
f64 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.f64 + (0))); } } } } while (0); break
; } default: { do { uint32_t _i_, _j_; const uint32_t _size_ =
(alloc)->size; __attribute__((unused)) const size_t _c_ =
(((alloc)->type) & 0xFFF); if ((alloc)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < _size_; _i_++) { ccv_sparse_matrix_index_t
* const _idx_ = (alloc)->index + _i_; ccv_sparse_matrix_vector_t
* const _v_ = (alloc)->vector + _i_; if (_idx_->ifbit <=
1 || !_v_->size) continue; for (_j_ = 0; _j_ < _v_->
size; _j_++) { for_block((_idx_->i), (_j_), (_v_->data.
u8 + (_j_ * _c_))); } } } else { const size_t _idx_size_ = sizeof
(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((alloc
)->type) & 0xFF000) >> 12] * (((alloc)->type)
& 0xFFF) + 3) & -4); for (_i_ = 0; _i_ < _size_; _i_
++) { ccv_sparse_matrix_index_t* const _idx_ = (alloc)->index
+ _i_; ccv_sparse_matrix_vector_t* const _v_ = (alloc)->vector
+ _i_; if (_idx_->ifbit <= 1 || !_v_->rnum) continue
; uint8_t* const _vidx_ = (uint8_t*)_v_->index; for (_j_ =
0; _j_ < _v_->size; _j_++) { ccv_sparse_matrix_index_t
* const _idx_j_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _j_); if (_idx_j_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_j_ + 1) }; for_block((_idx_->
i), (_idx_j_->i), (_d_.u8 + (0))); } } } } while (0); } } }
} while (0)
;
632#undef for_block
633 ccv_matrix_free(alloc);
634 for (i = 0; i < tensor_block_size; i++)
635 if (adj[i].itf)
636 ccv_array_free(adj[i].itf);
637 ccfreefree(adj);
638 ccv_nnc_tensor_alloc_prep_t* alloc_prep = (ccv_nnc_tensor_alloc_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_alloc_prep_t) + sizeof(alloc_prep->blocks[0]) * available_tensor_size + sizeof(alloc_prep->buffers[0]) * num_assigned + sizeof(int) * tensor_block_size);
639 alloc_prep->alloc_dep = alloc_dep;
640 alloc_prep->vt_block_size = tensor_block_size;
641 alloc_prep->buffer_size = num_assigned;
642 alloc_prep->block_size = available_tensor_size;
643 alloc_prep->blocks = (void*)(alloc_prep + 1); // From the biggest structs to smaller ones.
644 alloc_prep->buffers = (void*)(alloc_prep->blocks + available_tensor_size);
645 alloc_prep->vt_blocks = (int*)(alloc_prep->buffers + num_assigned);
646 memset(alloc_prep->buffers, 0, sizeof(alloc_prep->buffers[0]) * num_assigned);
647 for (i = 0; i < num_assigned; i++)
648 alloc_prep->buffers[i].size = allocated_size[i];
649 if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO)(CCV_CLI_INFO & ccv_cli_get_output_levels()))
650 {
651 size_t total_size = 0;
652 for (i = 0; i < num_assigned; i++)
653 total_size += allocated_size[i];
654 PRINT(CCV_CLI_INFO, "Total buffer size of %zu to be allocated\n", total_size)do { if ((CCV_CLI_INFO & ccv_cli_get_output_levels())) { printf
("Total buffer size of %zu to be allocated\n", total_size); fflush
(stdout); } } while (0)
;
655 }
656 ccfreefree(allocated_size);
657 j = 0;
658 // Assigning out the tensors (in case of sharing tensors / in-place ops).
659 for (i = 0; i < tensor_block_size; i++)
660 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
661 {
662 alloc_prep->blocks[j].block_ref = i;
663 if (!TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
664 {
665 alloc_prep->vt_blocks[i] = j;
666 // Also, set its allocations.
667 assert(assigned[i] > 0)((void) sizeof ((assigned[i] > 0) ? 1 : 0), __extension__ (
{ if (assigned[i] > 0) ; else __assert_fail ("assigned[i] > 0"
, "ccv_nnc_symbolic_graph_compile.c", 667, __extension__ __PRETTY_FUNCTION__
); }))
;
668 const int buffer_ref = alloc_prep->blocks[j].buffer_ref = assigned[i] - 1;
669 alloc_prep->blocks[j].offset = allocated_offset[i];
670 if (!alloc_prep->buffers[buffer_ref].type)
671 alloc_prep->buffers[buffer_ref].type = tensor_blocks[i].type;
672 alloc_prep->buffers[buffer_ref].pin_mem = alloc_prep->buffers[buffer_ref].pin_mem || tensor_blocks[i].pin_mem;
673 alloc_prep->buffers[buffer_ref].flags |= TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc);
674 assert(allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size)((void) sizeof ((allocated_offset[i] + tensor_blocks[i].size <=
alloc_prep->buffers[buffer_ref].size) ? 1 : 0), __extension__
({ if (allocated_offset[i] + tensor_blocks[i].size <= alloc_prep
->buffers[buffer_ref].size) ; else __assert_fail ("allocated_offset[i] + tensor_blocks[i].size <= alloc_prep->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 674, __extension__ __PRETTY_FUNCTION__
); }))
;
675 } else {
676 alloc_prep->vt_blocks[i] = -1;
677 alloc_prep->blocks[j].buffer_ref = -1;
678 alloc_prep->blocks[j].offset = 0;
679 }
680 ++j;
681 } else
682 alloc_prep->vt_blocks[i] = -1;
683 ccfreefree(allocated_offset);
684 ccfreefree(assigned);
685 return alloc_prep;
686}
687
// Release an alloc prep and the arrays it owns.
// Frees, in order: every non-NULL alloc_dep[i] (vt_block_size entries), every
// non-NULL buffers[i].dup_p_refs (buffer_size entries), the alloc_dep table
// itself, and finally the alloc_prep allocation.
// blocks / buffers / vt_blocks are not freed individually: the constructor
// above carves them out of the same allocation as alloc_prep.
688static void _ccv_nnc_tensor_alloc_prep_free(ccv_nnc_tensor_alloc_prep_t* alloc_prep)
689{
690 int i;
// Per-block dependency arrays; entries may be NULL and are skipped.
691 for (i = 0; i < alloc_prep->vt_block_size; i++)
692 if (alloc_prep->alloc_dep[i])
693 ccv_array_free(alloc_prep->alloc_dep[i]);
// Per-buffer dup_p_refs arrays; entries may be NULL and are skipped.
694 for (i = 0; i < alloc_prep->buffer_size; i++)
695 if (alloc_prep->buffers[i].dup_p_refs)
696 ccv_array_free(alloc_prep->buffers[i].dup_p_refs);
// alloc_dep must be released before alloc_prep, since it is reached through it.
697 ccfreefree(alloc_prep->alloc_dep);
698 ccfreefree(alloc_prep);
699}
700
701// Simple allocator from ccv_array_t.
// Grows tensor_metadata by ceil(size / 16) elements and returns a tagged
// position for the first newly reserved element: (old rnum << 1) + 1.
// The result is always odd, which is what CCV_NNC_IS_METADATA_POS tests for,
// and _ccv_nnc_tensor_metadata_get() undoes the shift with (pos >> 1).
// NOTE(review): the division by 16 assumes the array element size is 16 bytes
// - confirm against where tensor_metadata is created.
// NOTE(review): presumably ccv_array_resize may move the backing storage, which
// would be why callers keep positions rather than pointers - confirm.
702static int _ccv_nnc_tensor_metadata_pos_new(ccv_array_t* const tensor_metadata, const size_t size)
703{
// Current element count is the index of the first element we are about to add.
704 int pos = tensor_metadata->rnum;
// Number of elements needed to hold `size` bytes, rounded up.
705 int rsize = (size + 15) / 16;
706 ccv_array_resize(tensor_metadata, pos + rsize);
707 return (pos << 1) + 1;
708}
709
// Translate a tagged position (as returned by _ccv_nnc_tensor_metadata_pos_new)
// into a pointer into tensor_metadata's storage.
// The element index is (pos >> 1); it is asserted to be below rnum.
// The returned pointer is data + rsize * index, cast to ccv_nnc_tensor_t*.
710static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_get(const ccv_array_t* const tensor_metadata, const int pos)
711{
712 assert((pos >> 1) < tensor_metadata->rnum)((void) sizeof (((pos >> 1) < tensor_metadata->rnum
) ? 1 : 0), __extension__ ({ if ((pos >> 1) < tensor_metadata
->rnum) ; else __assert_fail ("(pos >> 1) < tensor_metadata->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 712, __extension__ __PRETTY_FUNCTION__
); }))
;
713 return (ccv_nnc_tensor_t*)ccv_array_get(tensor_metadata, pos >> 1)((void*)(((char*)((tensor_metadata)->data)) + (size_t)(tensor_metadata
)->rsize * (size_t)(pos >> 1)))
;
714}
715
// Non-zero when the low bit of `ptr` (viewed as an integer) is set. In this
// file that marks a value as a tagged metadata position produced by
// _ccv_nnc_tensor_metadata_pos_new() rather than a real tensor pointer.
716#define CCV_NNC_IS_METADATA_POS(ptr)((uintptr_t)(ptr) & 1) ((uintptr_t)(ptr) & 1)
717
// Resolve a possibly-tagged tensor reference into a real pointer and, for the
// resolved tensor, replace the tagged positions it stores with real pointers.
//
// tensor_metadata: array the positions index into.
// vt_tensor: either a normal tensor pointer (low bit clear) or a tagged
//            position smuggled through a pointer-typed value (low bit set).
// Returns vt_tensor unchanged when it is not a position; otherwise the pointer
// to the tensor stored at that position.
//
// Fields rewritten in place when they hold a tagged position:
//   - tensor->alias_ref (target is rewired recursively),
//   - each of the first (kind + repeat) multiview data entries (recursively),
//   - mv->p (replaced only, not recursed into),
//   - each entry of mv->sp (asserted not to be a multiview, then recursed).
718static ccv_nnc_tensor_t* _ccv_nnc_tensor_metadata_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_t* const vt_tensor)
719{
720 // If the low bit is not 1, this is not a position (but a normal tensor pointer), just return directly.
721 if (!CCV_NNC_IS_METADATA_POS(vt_tensor)((uintptr_t)(vt_tensor) & 1))
722 return vt_tensor;
723 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)vt_tensor);
724 if (tensor->alias_ref && CCV_NNC_IS_METADATA_POS(tensor->alias_ref)((uintptr_t)(tensor->alias_ref) & 1))
725 {
// Keep the tagged value: the field is overwritten with a pointer below, but
// the recursive call still needs the position.
726 const int alias_ref = tensor->alias_ref;
727 tensor->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)tensor->alias_ref);
728 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)alias_ref);
729 }
730 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
731 {
732 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
733 int i;
734 const int count = mv->kind + mv->repeat;
735 for (i = 0; i < count; i++)
736 {
737 if (CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)(((mv)->_heap_data ? (mv)->_heap_data : (mv
)->_inline_data)[i]) & 1)
)
738 {
// Save the position first, swap the slot to a real pointer, then rewire the
// child through the saved position.
739 const int pos = (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i];
740 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
741 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
742 }
743 }
744 // No need to recursively do parent pointer, otherwise we are in deep rewire.
745 if (mv->p && CCV_NNC_IS_METADATA_POS(mv->p)((uintptr_t)(mv->p) & 1))
746 mv->p = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)mv->p);
747 if (mv->sp)
748 for (i = 0; i < mv->sp->rnum; i++)
749 {
// Note: this local `tensor` (a pointer to the array slot) shadows the outer
// `tensor` for the rest of this loop body.
750 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
751 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
752 {
753 const int pos = (int)(intptr_t)*tensor;
754 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
755 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 755, __extension__ __PRETTY_FUNCTION__
); }))
;
756 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
757 }
758 }
759 }
// Here `tensor` is again the outer variable: the resolved tensor pointer.
760 return tensor;
761}
762
// Pairs a read-only byte pointer with an integer position.
// NOTE(review): no user of this type is visible in this section of the file;
// presumably `pos` is a tagged metadata position for the tensor created at
// `ptr` - confirm against the callers.
763typedef struct {
764 const uint8_t* ptr;
765 int pos;
766} ccv_nnc_tensor_block_pos_t;
767
// Starting from block_ref in `prep`, walk up through preps[idx - 1] .. preps[0]
// looking for a level whose tensor arena already has a buffer pointer for the
// corresponding parent block. When one is found, a new ccv_nnc_tensor_t is
// created in tensor_metadata on that buffer at the accumulated offset.
//
// tensor_metadata: metadata array the new tensor is allocated from.
// params: tensor parameters passed to ccv_nnc_tensor() for the new tensor.
// preps: prep chain; preps[i] for i < idx are the levels walked.
// block_ref: tensor block index inside `prep`.
// ch: per-level choice; ch[i] == 0 keeps the block itself, ch[i] == k > 0
//     selects dup_tensor_block_ref[p_ref * unroll_count + k - 1] at level i.
// idx: number of levels above `prep` to consider.
// prep: the level block_ref belongs to.
// Returns the tagged metadata position of the created tensor, or 0 when no
// level provides a buffer pointer (or when a remaining ch[i] is non-zero).
768static int _ccv_nnc_tensor_multiview_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const int block_ref, const int* const ch, const int idx, const ccv_nnc_symbolic_graph_prep_t* prep)
769{
770 int i;
// Follow the `ref` chain to its end (ref is 1-based, 0 ends the chain).
771 int unref_block_ref = block_ref;
772 while (prep->tensor_blocks[unref_block_ref].ref)
773 unref_block_ref = prep->tensor_blocks[unref_block_ref].ref - 1;
774 int vt_ref = prep->alloc_prep->vt_blocks[unref_block_ref];
775 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 775, __extension__ __PRETTY_FUNCTION__); }))
;
776 assert(unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((unref_block_ref == prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (unref_block_ref
== prep->alloc_prep->blocks[vt_ref].block_ref) ; else __assert_fail
("unref_block_ref == prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 776, __extension__ __PRETTY_FUNCTION__
); }))
;
777 const int buffer_ref = prep->alloc_prep->blocks[vt_ref].buffer_ref;
// Offset accumulates across levels: block offset here plus the offset of each
// parent block visited in the loop below.
778 uint64_t offset = prep->alloc_prep->blocks[vt_ref].offset;
// p_refs[0] is 1-based; p_ref becomes -1 when the buffer has no parent.
779 int p_ref = prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
780 for (i = idx - 1; i >= 0; i--)
781 {
782 assert(p_ref >= 0)((void) sizeof ((p_ref >= 0) ? 1 : 0), __extension__ ({ if
(p_ref >= 0) ; else __assert_fail ("p_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 782, __extension__ __PRETTY_FUNCTION__); }))
;
783 const ccv_nnc_symbolic_graph_prep_t* const graph_prep = preps[i];
784 const int unroll_count = graph_prep->unroll_count;
785 if (ch[i]) // Prefer the dup side of things.
786 p_ref = graph_prep->dup_tensor_block_ref[p_ref * unroll_count + ch[i] - 1];
787 int unref_p_ref = p_ref;
788 while (graph_prep->tensor_blocks[unref_p_ref].ref)
789 unref_p_ref = graph_prep->tensor_blocks[unref_p_ref].ref - 1;
// NOTE(review): unlike line 775, vt_ref is not asserted to be >= 0 here before
// it is used as an index - confirm it cannot be -1 at this point.
790 vt_ref = graph_prep->alloc_prep->vt_blocks[unref_p_ref];
// This buffer_ref shadows the outer one and refers to the parent level.
791 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
792 offset += graph_prep->alloc_prep->blocks[vt_ref].offset;
793 // If the buffer already exists, prefer that.
794 const uint8_t* ptr = graph_prep->tensor_arena->buffers[buffer_ref].ptr;
795 if (ptr)
796 {
797 // If I have any remaining path that is not covered from 0, I cannot possibly
798 // have any pointer from buffer (that can only happen if it is not dup).
799 for (--i; i >= 0; i--)
800 if (ch[i] != 0)
801 return 0;
// NOTE(review): the comment on line 802 mentions a linear scan, but the code
// below always allocates a fresh metadata entry; no lookup is performed here.
802 // Try to find the created tensor block pos in the array, just linear scan.
803 const int tv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
804 ccv_nnc_tensor_t* const tv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, tv_pos);
805 *tv = ccv_nnc_tensor(graph_prep->tensor_arena->buffers[buffer_ref].ptr, params, 0);
// Shift the tensor's data by the accumulated offset within the buffer.
806 ccv_nnc_tensor_data_add(tv->info, offset, &tv->data, &tv->dataof);
807 return tv_pos;
808 }
// No pointer at this level: continue with this buffer's own parent block.
809 p_ref = graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1;
810 }
811 return 0;
812}
813
814// Descent from root to the prep level, and compose multiview from there.
//
// Recurses over preps[idx], preps[idx + 1], ... until preps[idx] == graph_prep.
// At every level above graph_prep it tries ch[idx] = 0 and then
// ch[idx] = 1 .. unroll_count, and combines the results into a K0N multiview
// when the dup choices succeed.
//
// tensor_metadata: metadata array all tensors / multiviews are allocated from.
// params: tensor parameters forwarded to _ccv_nnc_tensor_multiview_find_pos.
// preserve: when non-zero, the result at the graph_prep level is wrapped in an
//           extra K1N multiview whose first entry is a placeholder.
// assign_update: only forwarded to the recursive calls in the visible code (its
//                one use in a condition is commented out).
// preps / graph_prep: the prep chain and the target level within it.
// block_ref: tensor block index inside graph_prep.
// ch: scratch array of per-level choices, written at ch[idx] by this function.
// idx: current depth in preps.
// pos_ref: out parameter (asserted non-NULL) receiving the tagged metadata
//          position of the resulting tensor or multiview.
// Returns 0 on success, or -1 when the graph_prep level cannot find a position
// (in which case *pos_ref is left untouched by that call).
//
// Multiview data slots are filled with tagged positions rather than pointers;
// _ccv_nnc_tensor_metadata_rewire() converts them later.
815static int _ccv_nnc_tensor_multiview_down_find_pos(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const int preserve, const int assign_update, const ccv_nnc_symbolic_graph_prep_t* const *const preps, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref, int* ch, const int idx, int* const pos_ref)
816{
817 assert(pos_ref)((void) sizeof ((pos_ref) ? 1 : 0), __extension__ ({ if (pos_ref
) ; else __assert_fail ("pos_ref", "ccv_nnc_symbolic_graph_compile.c"
, 817, __extension__ __PRETTY_FUNCTION__); }))
;
818 int i;
819 const ccv_nnc_symbolic_graph_prep_t* const prep = preps[idx];
820 const int unroll_count = prep->unroll_count;
// Base case: we have reached the level the block belongs to.
821 if (prep == graph_prep)
822 {
823 const int data_pos = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, block_ref, ch, idx, prep);
// find_pos returns 0 when nothing could be resolved for this ch path.
824 if (!data_pos)
825 return -1;
826 // Based on ch, go all the way back to find the exact pointer to compose.
827 if (// !assign_update && // If I plan to receive assign update, we don't need to have multiple receiver. Just one tensor to receive update is enough.
828 prep->dup_tensor_block_ref &&
829 prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
830 prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref)
831 {
// The block has distinct dups at this level: build a K0N multiview of
// unroll_count + 1 entries (the block itself followed by each dup).
832 int pos[unroll_count + 1];
833 pos[0] = data_pos;
834 for (i = 0; i < unroll_count; i++)
835 pos[i + 1] = _ccv_nnc_tensor_multiview_find_pos(tensor_metadata, params, preps, prep->dup_tensor_block_ref[block_ref * unroll_count + i], ch, idx, prep);
// All positions are allocated before any pointer is taken from the array.
836 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
837 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
838 ccv_nnc_tensor_t* data[unroll_count + 1];
839 for (i = 0; i < unroll_count + 1; i++)
840 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
841 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
// Overwrite the data slots with tagged positions so they can be rewired later.
842 for (i = 0; i < unroll_count + 1; i++)
843 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
844 *pos_ref = mv_pos;
845 } else {
846 *pos_ref = data_pos;
847 }
848 if (preserve)
849 {
850 // If need to preserve, this need to be more complicated. At loop 0, I need to access the new assigned tv.
851 // at any other loops, it should be the same. Thus, for this case, I will create a mv tensor as following:
852 // mv of K11, thus, when loop is 0, it unwrap to mv->data[0], otherwise, unwrap to mv->data[1].
853 // mv->data[0] (thin_mv) is a K01, which points to the assigned tv (using 1 as a placeholder here until parent
854 // arena allocated).
855 // mv->data[1] (prev_mv_pos_ is a K01 or K02, depending on whether above we passed raw pointer directly or
856 // a mv structure. If we pass a mv structure, we just pass it here. If we pass a raw pointer, we need to wrap
857 // it to a K01 structure.
858 // Why we didn't wrap it directly as mv->data[0] pointing to a assigned tv pointer and the mv->data[1] pointing
859 // to the raw pointer (as ptr_ref) with K11? The reason is we don't know the assigned tv is pointing to one
860 // memory region, or is a managed by multi-view tensor, which could pointing to different memory regions.
861 int prev_mv_pos = *pos_ref;
// NOTE(review): *pos_ref was assigned just above to either mv_pos or data_pos.
// Both originate from _ccv_nnc_tensor_metadata_pos_new(), which returns
// (pos << 1) + 1, and a zero data_pos already returned -1 earlier. As written
// this branch looks unreachable; the comment above suggests the raw-pointer
// case (*pos_ref == data_pos) was meant to be wrapped here - confirm intent.
862 if (prev_mv_pos == -1)
863 {
864 prev_mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
865 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
866 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_metadata, data_pos);
867 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
868 tv,
869 }, CCV_NNC_MULTIVIEW_K0N, 1, prep->graph, mv);
870 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)data_pos;
871 }
// Wrap the previous result in a K1N multiview: slot 0 is a placeholder, slot 1
// is the previous result (stored as a tagged position).
872 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
873 ccv_nnc_tensor_multiview_t* const prev_mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, prev_mv_pos);
874 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
875 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
876 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
877 (ccv_nnc_tensor_t*)prev_mv,
878 }, CCV_NNC_MULTIVIEW_K1N, 1, prep->graph, mv);
// Parent link is also stored as a tagged position.
879 prev_mv->p = (void*)(intptr_t)mv_pos;
880 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
881 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)prev_mv_pos;
882 *pos_ref = mv_pos;
883 }
884 return 0;
885 }
// Recursive case: first resolve the non-dup path (ch[idx] == 0), which is
// required to succeed.
886 ch[idx] = 0;
887 int pos[unroll_count + 1];
888 pos[0] = 0;
889 const int retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos);
890 assert(retval == 0)((void) sizeof ((retval == 0) ? 1 : 0), __extension__ ({ if (
retval == 0) ; else __assert_fail ("retval == 0", "ccv_nnc_symbolic_graph_compile.c"
, 890, __extension__ __PRETTY_FUNCTION__); }))
;
// Then try each dup path. A failure is only tolerated on the first dup
// (asserted), in which case the loop exits with i == 0.
891 for (i = 0; i < unroll_count; i++)
892 {
893 ch[idx] = i + 1;
894 pos[i + 1] = 0;
895 const int dup_retval = _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, idx + 1, pos + i + 1);
896 if (dup_retval < 0)
897 {
898 assert(i == 0)((void) sizeof ((i == 0) ? 1 : 0), __extension__ ({ if (i == 0
) ; else __assert_fail ("i == 0", "ccv_nnc_symbolic_graph_compile.c"
, 898, __extension__ __PRETTY_FUNCTION__); }))
;
899 break;
900 }
901 }
// i == 0 here means either the first dup failed or unroll_count is 0.
902 // If current prep has no dup.
903 if (i == 0)
904 {
905 *pos_ref = pos[0];
906 return 0;
907 }
908 ccv_nnc_tensor_t* data[unroll_count + 1];
909 // Compose to a new multiview.
910 for (i = 0; i < unroll_count + 1; i++)
911 { assert(pos[i] > 0)((void) sizeof ((pos[i] > 0) ? 1 : 0), __extension__ ({ if
(pos[i] > 0) ; else __assert_fail ("pos[i] > 0", "ccv_nnc_symbolic_graph_compile.c"
, 911, __extension__ __PRETTY_FUNCTION__); }))
; }
// Allocate the multiview before resolving any pointers from the array.
912 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
913 for (i = 0; i < unroll_count + 1; i++)
914 data[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos[i]);
915 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
916 ccv_nnc_tensor_multiview(data, CCV_NNC_MULTIVIEW_K0N, unroll_count + 1, prep->graph, mv);
// Point every child multiview's parent at the new multiview (as a position).
917 for (i = 0; i < unroll_count + 1; i++)
918 if (data[i] != CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)) && CCV_IS_TENSOR_MULTIVIEW(data[i])((*(int*)(data[i])) & CCV_TENSOR_MULTIVIEW))
919 ((ccv_nnc_tensor_multiview_t*)data[i])->p = (void*)(intptr_t)mv_pos;
// Store children as tagged positions for later rewiring.
920 for (i = 0; i < unroll_count + 1; i++)
921 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = (ccv_nnc_tensor_t*)(intptr_t)pos[i];
922 *pos_ref = mv_pos;
923 return 0;
924}
925
// Classify p_ref against the inputs and outputs of an exec symbol.
// p_ref: value compared against node->inputs[] and node->outputs[].
// node: exec symbol info to inspect (asserted non-NULL).
// Returns 1 when p_ref is among the outputs (even if it is also an input),
// -1 when it is only among the inputs, and 0 when it is in neither list.
926static int _ccv_nnc_is_symbolic_graph_exec_input_or_output(const int p_ref, const ccv_nnc_graph_exec_symbol_info_t *const node)
927{
928 int i;
929 int is_input = 0;
930 assert(node)((void) sizeof ((node) ? 1 : 0), __extension__ ({ if (node) ;
else __assert_fail ("node", "ccv_nnc_symbolic_graph_compile.c"
, 930, __extension__ __PRETTY_FUNCTION__); }))
;
// Both scans stop at the first match via the !is_input / !is_output condition.
931 for (i = 0; i < node->input_size && !is_input; i++)
932 if (p_ref == node->inputs[i])
933 is_input = 1;
934 int is_output = 0;
935 for (i = 0; i < node->output_size && !is_output; i++)
936 if (p_ref == node->outputs[i])
937 is_output = 1;
938 // Prefer it is an output if it is both the input and the output.
939 if (is_output)
940 return 1;
941 if (is_input)
942 return -1;
943 return 0;
944}
945
// Decide whether the tensor block block_ref of graph_prep must be "preserved"
// (see the long comment near the end of the function for what that means).
// Returns 1 only when every check below passes, otherwise 0:
//   - graph_prep has the CCV_NNC_GRAPH_EXEC_P_WHILE flag,
//   - the block is not unassigned,
//   - the block's first parent ref is an input only (classification == -1) of
//     the parent's exec symbol at exec_idx - 1,
//   - the buffer backing the block is not READ_ONLY,
//   - the buffer's first parent ref differs from the block's first parent ref.
946static int _ccv_nnc_tensor_block_check_preserve(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
947{
948 // No need to check whether to preserve if this is not a while loop.
949 if (!(graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
950 return 0;
951 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 951, __extension__ __PRETTY_FUNCTION__
); }))
;
952 // If it is unassigned, no need to preserve.
953 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
954 return 0;
// p_refs[0] is stored 1-based, hence the - 1.
955 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
956 // If p is not input, no need to preserve at all.
957 if (-1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
958 return 0;
959 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
960 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 960, __extension__ __PRETTY_FUNCTION__); }))
;
961 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 961, __extension__ __PRETTY_FUNCTION__
); }))
;
962 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
963 // If the buffer is a truly read-only one, no need to preserve.
964 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
965 return 0;
966 /* This needs detailed explanation, what does preserve mean?
967 * For a parameterized loop, such as while { y = x + 1 } (y => x), if tensor x is
968 * also used outside of the while loop, we cannot reuse the memory region of x for
969 * the for loop, otherwise we will destroy x when doing y = x + 1 computation (assuming
970 * y uses the same memory region as x). The way to workaround this is by using a different
971 * memory region for y = x + 1, but for the first iteration, having x pointing to the
972 * original. During the allocation process, the way to identify whether x should preserve
973 * its value or not by looking up its parent tensor. If the symbol (tensor_block)'s input
974 * parent tensor is the same as the memory region it plans to use in the buffer, then we are
975 * good (buffer.p_refs[0] == p_refs[0]). A buffer can only point to one parent tensor, and
976 * it is the input tensor whenever that is possible. A tensor block can point to two parent
977 * tensors, one is input tensor, one is the output tensor. p_refs[0] should be the input
978 * tensor whenever that is possible. */
979 if (graph_prep->alloc_prep->buffers[buffer_ref].p_refs[0] - 1 == p_ref)
980 return 0;
981 // Otherwise, return 1 because we now need to preserve.
982 return 1;
983}
984
// Decide whether the tensor block block_ref of graph_prep needs a forced
// broadcast. Returns 1 only when every check below passes, otherwise 0:
//   - the block is not unassigned,
//   - its tensor symbol has the CCV_NNC_TENSOR_SYMBOL_TAPE_VAR flag,
//   - the block's first parent ref is an output (classification == 1) of the
//     parent's exec symbol at exec_idx - 1,
//   - the buffer backing the block is not READ_ONLY.
// Unlike _ccv_nnc_tensor_block_check_preserve, there is no
// CCV_NNC_GRAPH_EXEC_P_WHILE check here.
985static int _ccv_nnc_tensor_block_check_force_broadcast(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int block_ref)
986{
987 assert(block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size)((void) sizeof ((block_ref >= 0 && block_ref < graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("block_ref >= 0 && block_ref < graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 987, __extension__ __PRETTY_FUNCTION__
); }))
;
988 // If it is unassigned, no need to preserve.
989 if (TENSOR_EXPECT_UNASSIGNED(graph_prep->tensor_blocks[block_ref])((graph_prep->tensor_blocks[block_ref].flags & 0x3) ==
UNASSIGNED)
)
990 return 0;
991 // Only tape var need to force broadcast, otherwise we already share the same memory region.
992 if (!(graph_prep->tensor_symbol_info[block_ref].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR))
993 return 0;
// p_refs[0] is stored 1-based, hence the - 1.
994 const int p_ref = graph_prep->tensor_blocks[block_ref].p_refs[0] - 1;
995 // If p is not output, no need to broadcast at all.
996 if (1 != _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref, graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1)))
997 return 0;
998 const int vt_ref = graph_prep->alloc_prep->vt_blocks[block_ref];
999 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 999, __extension__ __PRETTY_FUNCTION__); }))
;
1000 assert(block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref)((void) sizeof ((block_ref == graph_prep->alloc_prep->blocks
[vt_ref].block_ref) ? 1 : 0), __extension__ ({ if (block_ref ==
graph_prep->alloc_prep->blocks[vt_ref].block_ref) ; else
__assert_fail ("block_ref == graph_prep->alloc_prep->blocks[vt_ref].block_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1000, __extension__ __PRETTY_FUNCTION__
); }))
;
1001 const int buffer_ref = graph_prep->alloc_prep->blocks[vt_ref].buffer_ref;
1002 // If the buffer is a truly read-only one, no need to broadcast.
1003 if (TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[buffer_ref])(graph_prep->alloc_prep->buffers[buffer_ref].flags &
0xc)
== READ_ONLY)
1004 return 0;
1005 // Otherwise, return 1 because we now need to force broadcast for this tape var.
1006 return 1;
1007}
1008
1009static void _ccv_nnc_tensor_multiview_full_pos(ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_t* const tensor)
1010{
1011 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1011, __extension__ __PRETTY_FUNCTION__); }))
;
1012 int i;
1013 for (i = 0; i < mv->kind + mv->repeat; i++)
1014 if (CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] == CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)))
1015 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = tensor;
1016 else if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
1017 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i], tensor);
1018}
1019
1020static void _ccv_nnc_tensor_multiview_full_pos_rewire(const ccv_array_t* const tensor_metadata, ccv_nnc_tensor_multiview_t* const mv)
1021{
1022 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1022, __extension__ __PRETTY_FUNCTION__); }))
;
1023 int i;
1024 if (mv->sp)
1025 for (i = 0; i < mv->sp->rnum; i++)
1026 {
1027 ccv_nnc_tensor_t** const tensor = (ccv_nnc_tensor_t**)ccv_array_get(mv->sp, i)((void*)(((char*)((mv->sp)->data)) + (size_t)(mv->sp
)->rsize * (size_t)(i)))
;
1028 if (CCV_NNC_IS_METADATA_POS(*tensor)((uintptr_t)(*tensor) & 1))
1029 {
1030 const int pos = (int)(intptr_t)*tensor;
1031 *tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1032 assert(!CCV_IS_TENSOR_MULTIVIEW(*tensor))((void) sizeof ((!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(*tensor)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(*tensor)",
"ccv_nnc_symbolic_graph_compile.c", 1032, __extension__ __PRETTY_FUNCTION__
); }))
;
1033 _ccv_nnc_tensor_metadata_rewire(tensor_metadata, (ccv_nnc_tensor_t*)(intptr_t)pos);
1034 }
1035 }
1036 for (i = 0; i < mv->kind + mv->repeat; i++)
1037 {
1038 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i])((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]) & 1)
)
1039 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i] = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1040 if (CCV_NNC_IS_METADATA_POS((int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->alias_ref)((uintptr_t)((int)(intptr_t)((mv)->_heap_data ? (mv)->_heap_data
: (mv)->_inline_data)[i]->alias_ref) & 1)
)
1041 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref = (uintptr_t)_ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]->alias_ref);
1042 if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])((*(int*)(((mv)->_heap_data ? (mv)->_heap_data : (mv)->
_inline_data)[i])) & CCV_TENSOR_MULTIVIEW)
)
1043 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_metadata, (ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1044 }
1045}
1046
1047static int _ccv_nnc_tensor_multiview_gen(ccv_array_t* const tensor_metadata, const int preserve, const int assign_update, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena, const int block_ref)
1048{
1049 // Go to the root of the graph.
1050 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
1051 int i;
1052 for (i = 1; prep->p; i++)
1053 prep = prep->p;
1054 // Root graph should have no dup tensor blocks.
1055 assert(!prep->dup_tensor_block_ref)((void) sizeof ((!prep->dup_tensor_block_ref) ? 1 : 0), __extension__
({ if (!prep->dup_tensor_block_ref) ; else __assert_fail (
"!prep->dup_tensor_block_ref", "ccv_nnc_symbolic_graph_compile.c"
, 1055, __extension__ __PRETTY_FUNCTION__); }))
;
1056 const int c = i;
1057 const ccv_nnc_symbolic_graph_prep_t* preps[c];
1058 prep = graph_prep;
1059 preps[c - 1] = prep;
1060 for (i = 0; prep->p; i++)
1061 preps[c - 2 - i] = prep = prep->p;
1062 int ch[c]; // Use dynamic allocation for array. This is an array to record our selections when recursive from top to bottom.
1063 memset(ch, 0, sizeof(int) * c);
1064 int pos = 0;
1065 _ccv_nnc_tensor_multiview_down_find_pos(tensor_metadata, params, preserve, assign_update, preps, graph_prep, block_ref, ch, 0, &pos);
1066 assert(ch[c - 1] == 0)((void) sizeof ((ch[c - 1] == 0) ? 1 : 0), __extension__ ({ if
(ch[c - 1] == 0) ; else __assert_fail ("ch[c - 1] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1066, __extension__ __PRETTY_FUNCTION__); }))
; // This shouldn't never be modified.
1067 assert(pos > 0)((void) sizeof ((pos > 0) ? 1 : 0), __extension__ ({ if (pos
> 0) ; else __assert_fail ("pos > 0", "ccv_nnc_symbolic_graph_compile.c"
, 1067, __extension__ __PRETTY_FUNCTION__); }))
;
1068 return pos;
1069}
1070
1071static int _ccv_nnc_tensor_multiview_preserve_gen(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_param_t params, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_t* const tensor)
1072{
1073 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1074 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, mv_pos);
1075 ccv_nnc_tensor_t* const tv = CCV_NNC_IS_METADATA_POS(tensor)((uintptr_t)(tensor) & 1) ? _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)tensor) : tensor;
1076 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1077 CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10)),
1078 tv,
1079 }, CCV_NNC_MULTIVIEW_K1N, 1, graph_prep->graph, mv);
1080 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = CCV_NNC_TENSOR_PLACEHOLDER((ccv_nnc_tensor_t*)(intptr_t)(0x10));
1081 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = tensor;
1082 return mv_pos;
1083}
1084
1085static int _ccv_nnc_tensor_flat_if_multiview(ccv_array_t* const tensor_metadata, const int pos)
1086{
1087 ccv_nnc_tensor_t* tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1088 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1089 if (!is_multiview)
1090 return pos;
1091 while (CCV_IS_TENSOR_MULTIVIEW(tensor_ptr)((*(int*)(tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1092 {
1093 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)tensor_ptr;
1094 tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1095 }
1096 const ccv_nnc_tensor_t tensor = *tensor_ptr;
1097 const int new_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1098 ccv_nnc_tensor_t* const new_tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, new_pos);
1099 *new_tensor = ccv_nnc_tensor(tensor.data.u8, tensor.info, 0);
1100 new_tensor->dataof = tensor.dataof;
1101 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1102 new_tensor->alias_ref = (uintptr_t)pos;
1103 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)new_pos);
1104 return new_pos;
1105}
1106
1107static void _ccv_nnc_assign_vt_tensor_aliases(ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1108{
1109 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1110 // It referenced to is not an alias.
1111 assert(vt_tensors[alias_ref])((void) sizeof ((vt_tensors[alias_ref]) ? 1 : 0), __extension__
({ if (vt_tensors[alias_ref]) ; else __assert_fail ("vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1111, __extension__ __PRETTY_FUNCTION__
); }))
;
1112 const int alias_pos = (int)(intptr_t)vt_tensors[alias_ref];
1113 const ccv_nnc_tensor_t* alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1114 assert(!CCV_IS_TENSOR_VIEW(alias_tensor_ptr))((void) sizeof ((!((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_VIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(alias_tensor_ptr
)) & CCV_TENSOR_VIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_VIEW(alias_tensor_ptr)"
, "ccv_nnc_symbolic_graph_compile.c", 1114, __extension__ __PRETTY_FUNCTION__
); }))
;
1115 // Will use that to determine whether insert reference or not.
1116 const int is_multiview = CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW);
1117 while (CCV_IS_TENSOR_MULTIVIEW(alias_tensor_ptr)((*(int*)(alias_tensor_ptr)) & CCV_TENSOR_MULTIVIEW))
1118 {
1119 const ccv_nnc_tensor_multiview_t* const mv = (const ccv_nnc_tensor_multiview_t*)alias_tensor_ptr;
1120 alias_tensor_ptr = _ccv_nnc_tensor_metadata_get(tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1121 }
1122 const ccv_nnc_tensor_t alias_tensor = *alias_tensor_ptr;
1123 // If there is no ofs, and inc is the same as dim, we take a shortcut and just init as normal tensor.
1124 int pos;
1125 if (memcmp(ccv_nnc_no_ofs, tensor_symbol_info[block_ref].ofs, sizeof(ccv_nnc_no_ofs)) == 0 &&
1126 ccv_nnc_is_tensor_stride_packed(tensor_symbol_info[block_ref].stride, tensor_symbol_info[block_ref].info.dim))
1127 {
1128 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_t));
1129 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1130 *tensor = ccv_nnc_tensor(alias_tensor.data.u8, tensor_symbol_info[block_ref].info, 0);
1131 tensor->dataof = alias_tensor.dataof;
1132 } else {
1133 pos = _ccv_nnc_tensor_metadata_pos_new(tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1134 ccv_nnc_tensor_view_t* const tensor_view = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, pos);
1135 // Otherwise initialize a tensor view
1136 *tensor_view = ccv_nnc_tensor_view(&alias_tensor, tensor_symbol_info[block_ref].info, tensor_symbol_info[block_ref].ofs, tensor_symbol_info[block_ref].stride);
1137 tensor_view->alias_ref = (uintptr_t)alias_pos;
1138 }
1139 vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1140 if (is_multiview)
1141 {
1142 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_metadata, alias_pos);
1143 ccv_nnc_tensor_synchronize_to_multiview(mv, (ccv_nnc_tensor_t*)(intptr_t)pos);
1144 }
1145}
1146
1147static void _ccv_nnc_recursively_assign_vt_tensor_aliases(const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_array_t* const tensor_metadata, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int block_ref, ccv_nnc_tensor_t** const vt_tensors)
1148{
1149 // If this is an alias_ref and it hasn't been assigned, it must be an alias itself. Do this recursively.
1150 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[block_ref].alias_ref && !vt_tensors[block_ref])
1151 {
1152 const int ref = tensor_blocks[block_ref].alias_ref - 1;
1153 if (!vt_tensors[ref])
1154 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, ref, vt_tensors);
1155 vt_tensors[block_ref] = vt_tensors[ref];
1156 return;
1157 }
1158 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1158, __extension__ __PRETTY_FUNCTION__
); }))
;
1159 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1160 // If we don't have vt_tensors, this must be a ref with alias_ref (through folding). If that is the case, do this recursively until all aliases assigned.
1161 if (!vt_tensors[alias_ref])
1162 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_metadata, tensor_symbol_info, alias_ref, vt_tensors);
1163 _ccv_nnc_assign_vt_tensor_aliases(tensor_metadata, tensor_symbol_info, block_ref, vt_tensors);
1164}
1165
1166// Turn a linear pointer to an object storage (such as MTLBuffer).
1167#ifdef HAVE_MPS
// Disposer callback: hands ptr to mpobjfree. userdata is ignored.
static void _ccv_nnc_tensor_arena_obj_dispose(void* ptr, void* userdata)
{
	(void)userdata;
	mpobjfree(0, ptr);
}
1172#endif
1173
// Pairs a size with an object pointer.
// NOTE(review): no use of this type is visible in this part of the file; confirm it is still needed.
typedef struct {
	size_t size; // Size associated with obj (presumably in bytes; confirm).
	void* obj; // The tracked object.
} tensor_arena_obj_track_t;
1178
// Key of the obj_ptr hash map: identifies a region by base pointer, offset and size.
// Two keys are equal only when all three fields match.
typedef struct {
	void* ptr; // Base pointer.
	off_t offset; // Offset from ptr.
	size_t size; // Size of the region.
} obj_ptr_key_t;
1184
1185static inline khint32_t _kh_obj_ptr_hash_func(const obj_ptr_key_t key)
1186{
1187 return ((uint64_t)(uintptr_t)key.ptr >> 4) + key.offset + key.size;
1188}
1189
1190static inline int _kh_obj_ptr_hash_equal(const obj_ptr_key_t a, const obj_ptr_key_t b)
1191{
1192 return (a.ptr == b.ptr && a.offset == b.offset && a.size == b.size);
1193}
1194
1195KHASH_INIT(obj_ptr, obj_ptr_key_t, void*, 1, _kh_obj_ptr_hash_func, _kh_obj_ptr_hash_equal)typedef struct kh_obj_ptr_s { khint_t n_buckets, size, n_occupied
, upper_bound; khint32_t *flags; obj_ptr_key_t *keys; void* *
vals; } kh_obj_ptr_t; static inline __attribute__ ((__unused__
)) kh_obj_ptr_t *kh_init_obj_ptr(void) { return (kh_obj_ptr_t
*)calloc(1,sizeof(kh_obj_ptr_t)); } static inline __attribute__
((__unused__)) void kh_destroy_obj_ptr(kh_obj_ptr_t *h) { if
(h) { free((void *)h->keys); free(h->flags); free((void
*)h->vals); free(h); } } static inline __attribute__ ((__unused__
)) void kh_clear_obj_ptr(kh_obj_ptr_t *h) { if (h && h
->flags) { memset(h->flags, 0xaa, ((h->n_buckets) <
16? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h->
size = h->n_occupied = 0; } } static inline __attribute__ (
(__unused__)) khint_t kh_get_obj_ptr(const kh_obj_ptr_t *h, obj_ptr_key_t
key) { if (h->n_buckets) { khint_t k, i, last, mask, step
= 0; mask = h->n_buckets - 1; k = _kh_obj_ptr_hash_func(key
); i = k & mask; last = i; while (!((h->flags[i>>
4]>>((i&0xfU)<<1))&2) && (((h->
flags[i>>4]>>((i&0xfU)<<1))&1) || !
_kh_obj_ptr_hash_equal(h->keys[i], key))) { i = (i + (++step
)) & mask; if (i == last) return h->n_buckets; } return
((h->flags[i>>4]>>((i&0xfU)<<1))&
3)? h->n_buckets : i; } else return 0; } static inline __attribute__
((__unused__)) int kh_resize_obj_ptr(kh_obj_ptr_t *h, khint_t
new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (
--(new_n_buckets), (new_n_buckets)|=(new_n_buckets)>>1,
(new_n_buckets)|=(new_n_buckets)>>2, (new_n_buckets)|=
(new_n_buckets)>>4, (new_n_buckets)|=(new_n_buckets)>>
8, (new_n_buckets)|=(new_n_buckets)>>16, ++(new_n_buckets
)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size
>= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0
; else { new_flags = (khint32_t*)malloc(((new_n_buckets) <
16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t)); if (
!new_flags) return -1; memset(new_flags, 0xaa, ((new_n_buckets
) < 16? 1 : (new_n_buckets)>>4) * sizeof(khint32_t))
; if (h->n_buckets < new_n_buckets) { obj_ptr_key_t *new_keys
= (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets *
sizeof(obj_ptr_key_t)); if (!new_keys) { free(new_flags); return
-1; } h->keys = new_keys; if (1) { void* *new_vals = (void
**)realloc((void *)h->vals,new_n_buckets * sizeof(void*));
if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals
; } } } } if (j) { for (j = 0; j != h->n_buckets; ++j) { if
(((h->flags[j>>4]>>((j&0xfU)<<1))&
3) == 0) { obj_ptr_key_t key = h->keys[j]; void* val; khint_t
new_mask; new_mask = new_n_buckets - 1; if (1) val = h->vals
[j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<<
1)); while (1) { khint_t k, i, step = 0; k = _kh_obj_ptr_hash_func
(key); i = k & new_mask; while (!((new_flags[i>>4]>>
((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask
; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<<
1))); if (i < h->n_buckets && ((h->flags[i>>
4]>>((i&0xfU)<<1))&3) == 0) { { obj_ptr_key_t
tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (1
) { void* tmp = h->vals[i]; h->vals[i] = val; val = tmp
; } (h->flags[i>>4]|=1ul<<((i&0xfU)<<
1)); } else { h->keys[i] = key; if (1) h->vals[i] = val
; break; } } } } if (h->n_buckets > new_n_buckets) { h->
keys = (obj_ptr_key_t*)realloc((void *)h->keys,new_n_buckets
* sizeof(obj_ptr_key_t)); if (1) h->vals = (void**)realloc
((void *)h->vals,new_n_buckets * sizeof(void*)); } free(h->
flags); h->flags = new_flags; h->n_buckets = new_n_buckets
; h->n_occupied = h->size; h->upper_bound = (khint_t
)(h->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static
inline __attribute__ ((__unused__)) khint_t kh_put_obj_ptr(kh_obj_ptr_t
*h, obj_ptr_key_t key, int *ret) { khint_t x; if (h->n_occupied
>= h->upper_bound) { if (h->n_buckets > (h->size
<<1)) { if (kh_resize_obj_ptr(h, h->n_buckets - 1) <
0) { *ret = -1; return h->n_buckets; } } else if (kh_resize_obj_ptr
(h, h->n_buckets + 1) < 0) { *ret = -1; return h->n_buckets
; } } { khint_t k, i, site, last, mask = h->n_buckets - 1,
step = 0; x = site = h->n_buckets; k = _kh_obj_ptr_hash_func
(key); i = k & mask; if (((h->flags[i>>4]>>
((i&0xfU)<<1))&2)) x = i; else { last = i; while
(!((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && (((h->flags[i>>4]>>((i&0xfU)
<<1))&1) || !_kh_obj_ptr_hash_equal(h->keys[i], key
))) { if (((h->flags[i>>4]>>((i&0xfU)<<
1))&1)) site = i; i = (i + (++step)) & mask; if (i ==
last) { x = site; break; } } if (x == h->n_buckets) { if (
((h->flags[i>>4]>>((i&0xfU)<<1))&
2) && site != h->n_buckets) x = site; else x = i; }
} } if (((h->flags[x>>4]>>((x&0xfU)<<
1))&2)) { h->keys[x] = key; (h->flags[x>>4]&=
~(3ul<<((x&0xfU)<<1))); ++h->size; ++h->
n_occupied; *ret = 1; } else if (((h->flags[x>>4]>>
((x&0xfU)<<1))&1)) { h->keys[x] = key; (h->
flags[x>>4]&=~(3ul<<((x&0xfU)<<1)))
; ++h->size; *ret = 2; } else *ret = 0; return x; } static
inline __attribute__ ((__unused__)) void kh_del_obj_ptr(kh_obj_ptr_t
*h, khint_t x) { if (x != h->n_buckets && !((h->
flags[x>>4]>>((x&0xfU)<<1))&3)) { (
h->flags[x>>4]|=1ul<<((x&0xfU)<<1));
--h->size; } }
1196
1197static inline void* _ccv_nnc_tensor_arena_obj_create(khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map, void* ptr, const size_t total_size, const off_t offset, const ccv_nnc_tensor_param_t params, ccv_nnc_tensor_arena_t* tensor_arena)
1198{
1199 if (params.dim[0] == 0)
1200 return 0;
1201#ifdef HAVE_MPS
1202 if (CCV_TENSOR_GET_MEMORY(params.type)((params.type) & 0x3) == CCV_TENSOR_GPU_MEMORY)
1203 {
1204 int ret;
1205 const size_t size = CCV_GET_DATA_TYPE_SIZE(params.datatype)_ccv_get_data_type_size[((params.datatype) & 0xFF000) >>
12]
* ccv_nnc_tensor_count(params);
1206 const obj_ptr_key_t key = {
1207 .ptr = ptr,
1208 .offset = offset,
1209 .size = size,
1210 };
1211 khiter_t k = kh_put(obj_ptr, obj_ptr_map, key, &ret)kh_put_obj_ptr(obj_ptr_map, key, &ret);
1212 if (ret != 0)
1213 {
1214 void* obj = mpobjcreate(ptr, offset, size);
1215 if (!tensor_arena->disposers)
1216 tensor_arena->disposers = ccv_array_new(sizeof(ccv_nnc_arena_disposer_t), 1, 0);
1217 ccv_nnc_arena_disposer_t disposer = {
1218 .ptr = obj,
1219 .userdata = 0,
1220 .dispose = _ccv_nnc_tensor_arena_obj_dispose
1221 };
1222 ccv_array_push(tensor_arena->disposers, &disposer);
1223 kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]) = obj;
1224 return obj;
1225 } else
1226 return kh_val(obj_ptr_map, k)((obj_ptr_map)->vals[k]);
1227 }
1228#endif
1229 return ptr + offset;
1230}
1231
1232static ccv_nnc_tensor_arena_t* _ccv_nnc_tensor_arena_new(ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_compile_allocator_t allocator, const ccv_nnc_tensor_arena_t* const p_arena, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size)
1233{
1234 // All tensors assigned out, now, the num_assigned is the number of dis-continuous buffers,
1235 // Each tensor have the designation in assigned array, and offset in allocated_offset.
1236 const ccv_nnc_tensor_alloc_prep_t* const alloc_prep = graph_prep->alloc_prep;
1237 ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
1238 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
1239 const int tensor_symbol_info_size = graph_prep->tensor_symbol_info_size;
1240 const ccv_nnc_symbolic_graph_prep_t* const p_graph_prep = graph_prep->p;
1241 const ccv_nnc_tensor_alloc_prep_t* const p_alloc_prep = p_graph_prep ? p_graph_prep->alloc_prep : 0;
1242 const int* const dup_tensor_block_ref = graph_prep->dup_tensor_block_ref;
1243 const int unroll_count = graph_prep->unroll_count;
1244 int i, j;
1245 for (i = 0; i < tensor_symbol_info_size; i++)
1246 for (j = 0; TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && j < unroll_count; j++)
1247 {
1248 const int dup_ref = dup_tensor_block_ref[i * unroll_count + j];
1249 if (dup_ref >= 0 && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[dup_ref])((tensor_blocks[dup_ref].flags & 0x3) == UNASSIGNED))
1250 TENSOR_EXPECT_UNSET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags & ~0x1)
)
;
1251 }
1252 ccv_nnc_tensor_arena_t* tensor_arena = (ccv_nnc_tensor_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_arena_t) + sizeof(tensor_arena->buffers[0]) * alloc_prep->buffer_size + sizeof(ccv_nnc_tensor_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size + sizeof(int) * tensor_symbol_info_size);
1253 graph_prep->tensor_arena = tensor_arena;
1254 tensor_arena->graph_ref = (intptr_t)graph_prep->symbolic_graph;
1255 tensor_arena->buffers = (void*)(tensor_arena + 1);
1256 tensor_arena->buffer_size = alloc_prep->buffer_size;
1257 tensor_arena->vt_tensor_size = tensor_symbol_info_size;
1258 tensor_arena->sub_arenas = (ccv_nnc_tensor_arena_t**)(tensor_arena->buffers + alloc_prep->buffer_size);
1259 tensor_arena->vt_tensors = (ccv_nnc_tensor_t**)(tensor_arena->sub_arenas + graph_prep->sub_prep_size);
1260 tensor_arena->vt_alias_refs = (int*)(tensor_arena->vt_tensors + tensor_symbol_info_size);
1261 tensor_arena->pb_vt_tensors = 0;
1262 tensor_arena->vt_alias_r_refs_p = 0;
1263 tensor_arena->vt_alias_r_refs = 0;
1264 tensor_arena->vt_sizes = 0;
1265 tensor_arena->sub_arena_size = graph_prep->sub_prep_size;
1266 tensor_arena->tensor_metadata = ccv_array_new(16 /* align to 16 bytes */, (sizeof(ccv_nnc_tensor_t) * tensor_symbol_info_size + 15) / 16, 0);
1267 tensor_arena->m_tensor_idx = ccv_array_new(sizeof(int), 0, 0);
1268 tensor_arena->allocator.context.free = allocator.context.free;
1269 tensor_arena->allocator.isa = allocator.isa;
1270 tensor_arena->disposers = 0;
1271 // Copy alias_ref info back to the tensor arena.
1272 for (i = 0; i < tensor_symbol_info_size; i++)
1273 tensor_arena->vt_alias_refs[i] = tensor_symbol_info[i].alias_ref;
1274 // Do the buffer copies.
1275 for (i = 0; i < alloc_prep->buffer_size; i++)
1276 tensor_arena->buffers[i].type = alloc_prep->buffers[i].type,
1277 tensor_arena->buffers[i].pin_mem = alloc_prep->buffers[i].pin_mem,
1278 tensor_arena->buffers[i].size = alloc_prep->buffers[i].size;
1279 if (graph_prep->while_count_tensor)
1280 {
1281 // If we need to have a while count tensor, allocate that first, set its pointer to point the while_count variable.
1282 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1283 assert((0 << 1) + 1 == pos)((void) sizeof (((0 << 1) + 1 == pos) ? 1 : 0), __extension__
({ if ((0 << 1) + 1 == pos) ; else __assert_fail ("(0 << 1) + 1 == pos"
, "ccv_nnc_symbolic_graph_compile.c", 1283, __extension__ __PRETTY_FUNCTION__
); }))
; // pos must be 0 position.
1284 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1285 *tensor = ccv_nnc_tensor_for_while_count(graph_prep->graph);
1286 }
1287 assert((p_arena && p_graph_prep) || (!p_arena && !p_graph_prep))((void) sizeof (((p_arena && p_graph_prep) || (!p_arena
&& !p_graph_prep)) ? 1 : 0), __extension__ ({ if ((p_arena
&& p_graph_prep) || (!p_arena && !p_graph_prep
)) ; else __assert_fail ("(p_arena && p_graph_prep) || (!p_arena && !p_graph_prep)"
, "ccv_nnc_symbolic_graph_compile.c", 1287, __extension__ __PRETTY_FUNCTION__
); }))
;
1288 if (p_arena && p_graph_prep)
1289 {
1290 // Don't need to allocate the actual buffer, just use the pointer from the above.
1291 PRINT(CCV_CLI_VERBOSE, "Buffer assignment for sub arena %p (parent %p)\n", tensor_arena, p_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer assignment for sub arena %p (parent %p)\n",
tensor_arena, p_arena); fflush(stdout); } } while (0)
;
1292 for (i = 0; i < tensor_arena->buffer_size; i++)
1293 {
1294 const int p_ref = alloc_prep->buffers[i].p_refs[0] - 1;
1295 int unref_p_ref = p_ref;
1296 while (p_graph_prep->tensor_blocks[unref_p_ref].ref)
1297 unref_p_ref = p_graph_prep->tensor_blocks[unref_p_ref].ref - 1;
1298 assert(unref_p_ref >= 0)((void) sizeof ((unref_p_ref >= 0) ? 1 : 0), __extension__
({ if (unref_p_ref >= 0) ; else __assert_fail ("unref_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 1298, __extension__ __PRETTY_FUNCTION__
); }))
;
1299 const int p_unroll_count = p_graph_prep->unroll_count;
1300 if (p_graph_prep->dup_tensor_block_ref &&
1301 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] >= 0 &&
1302 p_graph_prep->dup_tensor_block_ref[p_ref * p_unroll_count] != p_ref)
1303 {
1304 // This condition means in the parent graph, we point to multiple tensor blocks for the same
1305 // buffer, therefore, we cannot have one single pointer assigned in this case.
1306 // Later we will handle this by generate ccv_tensor_multiview_t structure.
1307 tensor_arena->buffers[i].ptr = 0;
1308 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1309 continue;
1310 }
1311 // Otherwise, find the actual buffer pointer.
1312 const int vt_ref = p_alloc_prep->vt_blocks[unref_p_ref];
1313 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1313, __extension__ __PRETTY_FUNCTION__); }))
;
1314 const int buffer_ref = p_alloc_prep->blocks[vt_ref].buffer_ref;
1315 if (!p_arena->buffers[buffer_ref].ptr)
1316 {
1317 // Pass it down as 0 ptr.
1318 tensor_arena->buffers[i].ptr = 0;
1319 PRINT(CCV_CLI_VERBOSE, "|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Cannot assign buffer %d, it points to multiple blocks (multi view tensor required)\n"
, i); fflush(stdout); } } while (0)
;
1320 continue;
1321 }
1322 const uint64_t offset = p_alloc_prep->blocks[vt_ref].offset;
1323 tensor_arena->buffers[i].ptr = p_arena->buffers[buffer_ref].ptr + offset;
1324 PRINT(CCV_CLI_VERBOSE, "|-Assign block %d in parent arena to buffer %d with offset %lu\n", vt_ref, i, (unsigned long)offset)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Assign block %d in parent arena to buffer %d with offset %lu\n"
, vt_ref, i, (unsigned long)offset); fflush(stdout); } } while
(0)
;
1325 }
1326 } else {
1327 // Now, allocate actual buffers.
1328 PRINT(CCV_CLI_VERBOSE, "Buffer allocation for arena %p\n", tensor_arena)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("Buffer allocation for arena %p\n", tensor_arena); fflush
(stdout); } } while (0)
;
1329 for (i = 0; i < tensor_arena->buffer_size; i++)
1330 {
1331 const int buffer_type = tensor_arena->buffers[i].type;
1332 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
1333 if (tensor_arena->buffers[i].size == 0)
1334 {
1335 tensor_arena->buffers[i].ptr = 0;
1336 PRINT(CCV_CLI_VERBOSE, "|-Skip buffer %d with size 0\n", i)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Skip buffer %d with size 0\n", i); fflush(stdout
); } } while (0)
;
1337 continue;
1338 }
1339#ifdef HAVE_CUDA1
1340 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1341 {
1342 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1343 if (allocator.isa && allocator.isa->alloc)
1344 tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1345 else
1346 tensor_arena->buffers[i].ptr = (uint8_t*)cumalloc(device_id, tensor_arena->buffers[i].size);
1347 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1348 } else {
1349 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1349, __extension__ __PRETTY_FUNCTION__
); }))
;
1350 if (tensor_arena->buffers[i].pin_mem)
1351 tensor_arena->buffers[i].ptr = (uint8_t*)cuhostalloc(tensor_arena->buffers[i].size);
1352 else
1353 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1354 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1355 }
1356#elif defined(HAVE_MPS)
1357 if (memory_type == CCV_TENSOR_GPU_MEMORY)
1358 {
1359 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
1360 // if (allocator.isa && allocator.isa->alloc)
1361 // tensor_arena->buffers[i].ptr = (uint8_t*)allocator.isa->alloc(buffer_type, 0, tensor_arena->buffers[i].size, allocator.context.alloc);
1362 // else
1363 tensor_arena->buffers[i].ptr = (uint8_t*)mpheapalloc(device_id, tensor_arena->buffers[i].size);
1364 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1365 } else {
1366 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1366, __extension__ __PRETTY_FUNCTION__
); }))
;
1367 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1368 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1369 }
1370#else
1371 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 1371, __extension__ __PRETTY_FUNCTION__
); }))
;
1372 ccmemalignposix_memalign((void**)&tensor_arena->buffers[i].ptr, 64, tensor_arena->buffers[i].size);
1373 PRINT(CCV_CLI_VERBOSE, "|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena->buffers[i].ptr, (unsigned long)tensor_arena->buffers[i].size)do { if ((CCV_CLI_VERBOSE & ccv_cli_get_output_levels()))
{ printf("|-Allocate buffer %d with ptr %p, size %lu\n", i, tensor_arena
->buffers[i].ptr, (unsigned long)tensor_arena->buffers[
i].size); fflush(stdout); } } while (0)
;
1374#endif
1375 assert(tensor_arena->buffers[i].ptr)((void) sizeof ((tensor_arena->buffers[i].ptr) ? 1 : 0), __extension__
({ if (tensor_arena->buffers[i].ptr) ; else __assert_fail
("tensor_arena->buffers[i].ptr", "ccv_nnc_symbolic_graph_compile.c"
, 1375, __extension__ __PRETTY_FUNCTION__); }))
;
1376 }
1377 }
1378 // Go over sub_preps and allocate arenas for them. Do it this early because
1379 // we may reference tensors from sub arenas. The reason we need to reference
1380 // tensors from sub arenas is that, for output tensors, the sub arena's tensor
1381 // will have automatic reference updates.
1382 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1383 if (graph_prep->sub_preps[i])
1384 tensor_arena->sub_arenas[i] = _ccv_nnc_tensor_arena_new(graph_prep->sub_preps[i], allocator, tensor_arena, tensor_binds, tensor_bind_size);
1385 else
1386 tensor_arena->sub_arenas[i] = 0;
1387 memset(tensor_arena->vt_tensors, 0, sizeof(ccv_nnc_tensor_t*) * tensor_symbol_info_size);
1388 // Now sub-arenas are all assigned, go over its outputs to assign out tensors from its output directly.
1389 ccv_nnc_tensor_t** sub_arena_out_tensors = tensor_arena->sub_arena_size ? (ccv_nnc_tensor_t**)cccalloccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_tensor_t*)) : 0;
1390#ifdef HAVE_MPS
1391 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = kh_init(obj_ptr)kh_init_obj_ptr();
1392#else
1393 khash_t(obj_ptr)kh_obj_ptr_t* obj_ptr_map = 0;
1394#endif
1395 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1396 if (tensor_arena->sub_arenas[i])
1397 {
1398 assert(graph_prep->sub_preps[i])((void) sizeof ((graph_prep->sub_preps[i]) ? 1 : 0), __extension__
({ if (graph_prep->sub_preps[i]) ; else __assert_fail ("graph_prep->sub_preps[i]"
, "ccv_nnc_symbolic_graph_compile.c", 1398, __extension__ __PRETTY_FUNCTION__
); }))
;
1399 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1400 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1401 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1402 for (j = 0; j < node->output_size; j++)
1403 {
1404 const int idx = node->outputs[j];
1405 const int s_idx = *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1;
1406 assert(s_idx >= 0)((void) sizeof ((s_idx >= 0) ? 1 : 0), __extension__ ({ if
(s_idx >= 0) ; else __assert_fail ("s_idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1406, __extension__ __PRETTY_FUNCTION__); }))
;
1407 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1408 assert(sub_arena_out_tensors[idx] == 0)((void) sizeof ((sub_arena_out_tensors[idx] == 0) ? 1 : 0), __extension__
({ if (sub_arena_out_tensors[idx] == 0) ; else __assert_fail
("sub_arena_out_tensors[idx] == 0", "ccv_nnc_symbolic_graph_compile.c"
, 1408, __extension__ __PRETTY_FUNCTION__); }))
;
1409 ccv_nnc_tensor_t* sub_alias = (ccv_nnc_tensor_t*)sub_tensor->alias_ref;
1410 // Only assign if it is a multiview tensor.
1411 if (CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) ||
1412 (sub_alias && CCV_IS_TENSOR_MULTIVIEW(sub_alias)((*(int*)(sub_alias)) & CCV_TENSOR_MULTIVIEW)))
1413 sub_arena_out_tensors[idx] = sub_tensor;
1414 }
1415 }
1416 // Assigning out the tensors (in case of sharing tensors / in-place ops).
1417 for (i = 0; i < tensor_symbol_info_size; i++)
1418 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
1419 {
1420 const int vt_ref = alloc_prep->vt_blocks[i];
1421 const int buffer_ref = vt_ref >= 0 ? alloc_prep->blocks[vt_ref].buffer_ref : -1;
1422 // Either we have dup_tensor_block_ref in current layer, or we have that in
1423 // previous layer, therefore, cannot really find the buffer ptr.
1424 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[i]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1425 ((graph_prep->dup_tensor_block_ref &&
1426 graph_prep->dup_tensor_block_ref[i * unroll_count] >= 0 &&
1427 graph_prep->dup_tensor_block_ref[i * unroll_count] != i) ||
1428 (buffer_ref >= 0 && !tensor_arena->buffers[buffer_ref].ptr)))
1429 {
1430 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1430, __extension__ __PRETTY_FUNCTION__
); }))
; // This must be in a sub-graph.
1431 // If this is an input tensor, and it needs to be preserved, wait until we go through inputs to preserve it.
1432 if (graph_prep->tensor_blocks[i].p_refs[0] && _ccv_nnc_tensor_block_check_preserve(graph_prep, i))
1433 continue;
1434 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 0, tensor_symbol_info[i].assign_ref, tensor_symbol_info[i].info, graph_prep, tensor_arena, i);
1435 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1436 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1437 } else if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED)) {
1438 // When we want to allocate, we don't really need to if it needs force broadcast, because we will handle that later.
1439 const uint64_t offset = alloc_prep->blocks[vt_ref].offset;
1440 // If already created, use the same tensor, and continue.
1441 // Having ptr.
1442 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1443 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1444 // Also, set its allocations.
1445 // Since tensor view is bit compatible with tensor, we can just cast.
1446 void* obj = _ccv_nnc_tensor_arena_obj_create(obj_ptr_map, tensor_arena->buffers[buffer_ref].ptr, tensor_arena->buffers[buffer_ref].size, offset, tensor_symbol_info[i].info, tensor_arena);
1447 *tensor = ccv_nnc_tensor(obj, tensor_symbol_info[i].info, 0);
1448 assert(offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size)((void) sizeof ((offset + tensor_blocks[i].size <= tensor_arena
->buffers[buffer_ref].size) ? 1 : 0), __extension__ ({ if (
offset + tensor_blocks[i].size <= tensor_arena->buffers
[buffer_ref].size) ; else __assert_fail ("offset + tensor_blocks[i].size <= tensor_arena->buffers[buffer_ref].size"
, "ccv_nnc_symbolic_graph_compile.c", 1448, __extension__ __PRETTY_FUNCTION__
); }))
;
1449 // If we need to force broadcast, we need to wrap it in a multiview.
1450 if (graph_prep->tensor_blocks[i].p_refs[0] &&
1451 _ccv_nnc_tensor_block_check_force_broadcast(graph_prep, i))
1452 {
1453 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1454 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1455 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1456 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1457 tv,
1458 }, 0, 1, graph_prep->graph, mv);
1459 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1460 pos = mv_pos;
1461 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1462 }
1463 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)pos; // Cast into vt_tensors for now, and later will rewire it.
1464 }
1465 }
1466#ifdef HAVE_MPS
1467 kh_destroy(obj_ptr, obj_ptr_map)kh_destroy_obj_ptr(obj_ptr_map);
1468#endif
1469 // Handle binded tensors. First handle cases without aliases.
1470 for (i = 0; i < tensor_bind_size; i++)
1471 {
1472 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1472, __extension__ __PRETTY_FUNCTION__
); }))
;
1473 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1474 if (resolved_symbol.d >= 0)
1475 {
1476 int d = resolved_symbol.d;
1477 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1478 continue;
1479 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1480 // It has nothing to do with alias.
1481 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1482 d = tensor_blocks[d].ref - 1;
1483 // For binded tensors, it shouldn't be assigned yet.
1484 // If it is assigned, the pointer should match the ones from the binded tensor.
1485 // This can only happen if an enforced in-place tensor is binded twice. If that
1486 // happens, we need to make sure it is binded to the same location.
1487 assert(!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((!tensor_arena->vt_tensors[d] || tensor_arena
->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->
data.u8) ? 1 : 0), __extension__ ({ if (!tensor_arena->vt_tensors
[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("!tensor_arena->vt_tensors[d] || tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1487, __extension__ __PRETTY_FUNCTION__
); }))
;
1488 // See above assertion.
1489 if (tensor_arena->vt_tensors[d])
1490 continue;
1491 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1492 {
1493 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1494 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1495 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1496 if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1497 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1498 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1498, __extension__ __PRETTY_FUNCTION__
); }))
; }
1499 // It is OK to be just as a whole smaller or equal to the binded one.
1500 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1500, __extension__ __PRETTY_FUNCTION__
); }))
;
1501 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1502 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1503 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1504 } else {
1505 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1506 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1507 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1508 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1509 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1510 tv->data = tensor_binds[i].tensor->data; // If there are offsets, copy it over.
1511 tv->dataof = tensor_binds[i].tensor->dataof;
1512 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1513 }
1514 }
1515 }
1516 // Handle binded tensors. We handle alias here so it can reference to binded tensors.
1517 for (i = 0; i < tensor_bind_size; i++)
1518 {
1519 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1519, __extension__ __PRETTY_FUNCTION__
); }))
;
1520 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1521 if (resolved_symbol.d >= 0)
1522 {
1523 int d = resolved_symbol.d;
1524 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
1525 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
1526 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1527 // It has nothing to do with alias.
1528 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1529 d = tensor_blocks[d].ref - 1;
1530 if (tensor_arena->vt_tensors[d])
1531 continue;
1532 // Assert original alias has no ofs. Otherwise our binding will be problematic.
1533 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1534 { assert(tensor_symbol_info[resolved_symbol.d].ofs[j] == 0)((void) sizeof ((tensor_symbol_info[resolved_symbol.d].ofs[j]
== 0) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[resolved_symbol
.d].ofs[j] == 0) ; else __assert_fail ("tensor_symbol_info[resolved_symbol.d].ofs[j] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1534, __extension__ __PRETTY_FUNCTION__
); }))
; }
1535 if (CCV_IS_TENSOR_VIEW(tensor_binds[i].tensor)((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_VIEW))
1536 {
1537 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_view_t));
1538 ccv_nnc_tensor_view_t* const tv = (ccv_nnc_tensor_view_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1539 ccv_nnc_tensor_view_t* const otv = (ccv_nnc_tensor_view_t*)tensor_binds[i].tensor;
1540 if (otv->off > 0) // If there is a off. This has to be the same dimensionality, or smaller at each dimension.
1541 for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC(12); j++)
1542 { assert(tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j])((void) sizeof ((tensor_symbol_info[d].info.dim[j] <= otv->
info.dim[j]) ? 1 : 0), __extension__ ({ if (tensor_symbol_info
[d].info.dim[j] <= otv->info.dim[j]) ; else __assert_fail
("tensor_symbol_info[d].info.dim[j] <= otv->info.dim[j]"
, "ccv_nnc_symbolic_graph_compile.c", 1542, __extension__ __PRETTY_FUNCTION__
); }))
; }
1543 // It is OK to be just as a whole smaller or equal to the binded one.
1544 assert(ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info))((void) sizeof ((ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ? 1 : 0), __extension__ ({ if (
ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count
(tensor_symbol_info[d].info)) ; else __assert_fail ("ccv_nnc_tensor_count(otv->info) >= ccv_nnc_tensor_count(tensor_symbol_info[d].info)"
, "ccv_nnc_symbolic_graph_compile.c", 1544, __extension__ __PRETTY_FUNCTION__
); }))
;
1545 memcpy(tv, otv, sizeof(ccv_nnc_tensor_view_t));
1546 memcpy(tv->info.dim, tensor_symbol_info[d].info.dim, sizeof(tv->info.dim));
1547 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1548 } else {
1549 int pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1550 ccv_nnc_tensor_t* const tv = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, pos);
1551 *tv = ccv_nnc_tensor(tensor_binds[i].tensor->data.u8, tensor_symbol_info[d].info, 0);
1552 tv->info.datatype = tensor_binds[i].tensor->info.datatype;
1553 tv->info.reserved = tensor_binds[i].tensor->info.reserved;
1554 tv->data = tensor_binds[i].tensor->data;
1555 tv->dataof = tensor_binds[i].tensor->dataof;
1556 tensor_arena->vt_tensors[d] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1557 }
1558 }
1559 }
1560 // Assign out refs, refs are simple ones, we should handle it first. (because they point to exactly the same metadata and same region).
1561 // Avoiding refs that actually is an alias.
1562 for (i = 0; i < tensor_symbol_info_size; i++)
1563 // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1564 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].ref && !tensor_arena->vt_tensors[i] && !tensor_blocks[i].alias_ref)
1565 {
1566 int ref = tensor_blocks[i].ref - 1;
1567 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && tensor_blocks[ref].ref)
1568 ref = tensor_blocks[ref].ref - 1;
1569 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1569, __extension__ __PRETTY_FUNCTION__); }))
;
1570 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1571 }
1572 // Now that refs are assigned out, handle the case where I need to preserve because I am a sub-graph of a while loop.
1573 if (graph_prep->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1574 {
1575 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 1575, __extension__ __PRETTY_FUNCTION__
); }))
;
1576 const ccv_nnc_graph_exec_symbol_info_t* node = graph_prep->p->exec_symbol_info + (graph_prep->exec_idx - 1);
1577 const int p_idx = graph_prep->p_idx - 1;
1578 for (i = 0; i < node->input_size; i++)
1579 {
1580 const int idx = node->inputs[i];
1581 int block_ref = *(int*)ccv_array_get(graph_prep->p->tensor_symbol_info[idx].s_ref, p_idx)((void*)(((char*)((graph_prep->p->tensor_symbol_info[idx
].s_ref)->data)) + (size_t)(graph_prep->p->tensor_symbol_info
[idx].s_ref)->rsize * (size_t)(p_idx)))
- 1;
1582 assert(!tensor_blocks[block_ref].ref)((void) sizeof ((!tensor_blocks[block_ref].ref) ? 1 : 0), __extension__
({ if (!tensor_blocks[block_ref].ref) ; else __assert_fail (
"!tensor_blocks[block_ref].ref", "ccv_nnc_symbolic_graph_compile.c"
, 1582, __extension__ __PRETTY_FUNCTION__); }))
;
1583 const int vt_ref = alloc_prep->vt_blocks[block_ref];
1584 if (!_ccv_nnc_tensor_block_check_preserve(graph_prep, block_ref))
1585 continue;
1586 assert(vt_ref >= 0)((void) sizeof ((vt_ref >= 0) ? 1 : 0), __extension__ ({ if
(vt_ref >= 0) ; else __assert_fail ("vt_ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 1586, __extension__ __PRETTY_FUNCTION__); }))
;
1587 const int buffer_ref = alloc_prep->blocks[vt_ref].buffer_ref;
1588 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[block_ref].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])", "ccv_nnc_symbolic_graph_compile.c"
, 1588, __extension__ __PRETTY_FUNCTION__); }))
;
1589 assert(!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref]))((void) sizeof ((!((tensor_blocks[block_ref].flags & 0x3)
== ALIAS)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks[block_ref
].flags & 0x3) == ALIAS)) ; else __assert_fail ("!TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 1589, __extension__ __PRETTY_FUNCTION__
); }))
;
1590 // Either we have dup_tensor_block_ref in current layer, or we have that in
1591 // previous layer, therefore, cannot really find the buffer ptr.
1592 if ((!sub_arena_out_tensors || !sub_arena_out_tensors[block_ref]) && // If it is already generated by sub arena, it can be ordinary out tensors. (What if out tensor is not even generated by sub graph when running? In this case, the behavior is undefined anyway).
1593 ((graph_prep->dup_tensor_block_ref &&
1594 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] >= 0 &&
1595 graph_prep->dup_tensor_block_ref[block_ref * unroll_count] != block_ref) ||
1596 !tensor_arena->buffers[buffer_ref].ptr))
1597 {
1598 // We haven't allocated anything for this yet.
1599 assert(tensor_arena->vt_tensors[block_ref] == 0)((void) sizeof ((tensor_arena->vt_tensors[block_ref] == 0)
? 1 : 0), __extension__ ({ if (tensor_arena->vt_tensors[block_ref
] == 0) ; else __assert_fail ("tensor_arena->vt_tensors[block_ref] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 1599, __extension__ __PRETTY_FUNCTION__
); }))
;
1600 const int pos = _ccv_nnc_tensor_multiview_gen(tensor_arena->tensor_metadata, 1, tensor_symbol_info[i].assign_ref, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena, block_ref);
1601 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)pos;
1602 ccv_array_push(tensor_arena->m_tensor_idx, &pos);
1603 } else {
1604 const int mv_pos = _ccv_nnc_tensor_multiview_preserve_gen(tensor_arena->tensor_metadata, tensor_symbol_info[block_ref].info, graph_prep, tensor_arena->vt_tensors[block_ref]);
1605 tensor_arena->vt_tensors[block_ref] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos; // Cast into vt_tensors for now, and later will rewire.
1606 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1607 }
1608 }
1609 }
1610 // For a case..of statement, the output is a phi variable; thus, if we take the skip branch, we will select the original input.
1611 // This creates the multi-view tensor to achieve that.
1612 for (i = 0; i < tensor_symbol_info_size; i++)
1613 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1614 {
1615 const int bypass_ref = tensor_blocks[i].bypass_ref - 1;
1616 // Create phi multi-view.
1617 const int mv_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_multiview_t));
1618 const int intv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[bypass_ref]);
1619 const int outv_pos = _ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1620 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, mv_pos);
1621 ccv_nnc_tensor_t* const intv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, intv_pos);
1622 ccv_nnc_tensor_t* const outv = (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, outv_pos);
1623 ccv_nnc_tensor_multiview((ccv_nnc_tensor_t*[]){
1624 intv,
1625 outv,
1626 }, CCV_NNC_MULTIVIEW_K0N, 2, (ccv_nnc_graph_t*)CCV_NNC_MULTIVIEW_PHI(intptr_t)0x1, mv);
1627 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0] = (ccv_nnc_tensor_t*)(intptr_t)intv_pos;
1628 CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[1] = (ccv_nnc_tensor_t*)(intptr_t)outv_pos;
1629 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)mv_pos;
1630 ccv_array_push(tensor_arena->m_tensor_idx, &mv_pos);
1631 }
1632 // Now it is time to handle alias.
1633 for (i = 0; i < alloc_prep->block_size; i++)
1634 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1635 {
1636 const int block_ref = alloc_prep->blocks[i].block_ref;
1637 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS) && !tensor_arena->vt_tensors[block_ref])
1638 {
1639 // Assigning out the tensor aliases.
1640 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1640, __extension__ __PRETTY_FUNCTION__
); }))
;
1641 _ccv_nnc_recursively_assign_vt_tensor_aliases(tensor_blocks, tensor_arena->tensor_metadata, tensor_symbol_info, block_ref, tensor_arena->vt_tensors);
1642 }
1643 }
1644 // Now assigning out the rest of alias refs.
1645 for (i = 0; i < tensor_symbol_info_size; i++)
1646 // It could be binded tensor (or unused), in that case, it doesn't have a ref.
1647 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_blocks[i].alias_ref && !tensor_arena->vt_tensors[i])
1648 {
1649 int ref = tensor_blocks[i].alias_ref - 1;
1650 assert(tensor_arena->vt_tensors[ref])((void) sizeof ((tensor_arena->vt_tensors[ref]) ? 1 : 0), __extension__
({ if (tensor_arena->vt_tensors[ref]) ; else __assert_fail
("tensor_arena->vt_tensors[ref]", "ccv_nnc_symbolic_graph_compile.c"
, 1650, __extension__ __PRETTY_FUNCTION__); }))
;
1651 tensor_arena->vt_tensors[i] = tensor_arena->vt_tensors[ref];
1652 }
1653 // Replacing the tensor placeholder within sub arena's multi-view to the input tensor.
1654 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1655 if (tensor_arena->sub_arenas[i])
1656 {
1657 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1658 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1659 for (j = 0; j < node->input_size; j++)
1660 {
1661 const int idx = node->inputs[j];
1662 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1663 if (s_idx < 0)
1664 continue;
1665 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1666 // Only do the replacement if it is a multi-view tensor.
1667 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1668 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1669 {
1670 // It cannot be binded tensor.
1671 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[idx
]) & 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[idx]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[idx])"
, "ccv_nnc_symbolic_graph_compile.c", 1671, __extension__ __PRETTY_FUNCTION__
); }))
;
1672 const int vt_pos = (int)(intptr_t)tensor_arena->vt_tensors[idx];
1673 const int is_sub_arena_out_tensor = (sub_arena_out_tensors && sub_arena_out_tensors[idx]);
1674 ccv_nnc_tensor_t* const vt_tensor = is_sub_arena_out_tensor ? sub_arena_out_tensors[idx] : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos);
1675 // If this tensor is also a multiview, we need to first generate a new tensor, and then generate a reference
1676 // to this tensor.
1677 if (CCV_IS_TENSOR_MULTIVIEW(vt_tensor)((*(int*)(vt_tensor)) & CCV_TENSOR_MULTIVIEW))
1678 {
1679 const int ref_pos = _ccv_nnc_tensor_metadata_pos_new(tensor_arena->tensor_metadata, sizeof(ccv_nnc_tensor_t));
1680 ccv_nnc_tensor_t* const ref_tensor = _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, ref_pos);
1681 ccv_nnc_tensor_multiview_t* const multiview = (ccv_nnc_tensor_multiview_t*)(is_sub_arena_out_tensor ? vt_tensor : _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, vt_pos));
1682 ref_tensor->alias_ref = is_sub_arena_out_tensor ? (uintptr_t)vt_tensor : (uintptr_t)vt_pos;
1683 ccv_nnc_tensor_synchronize_to_multiview(multiview, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1684 ccv_nnc_tensor_t* tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA(multiview)[0])((uintptr_t)(((multiview)->_heap_data ? (multiview)->_heap_data
: (multiview)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA(multiview)((multiview)->_heap_data ? (multiview)->_heap_data : (multiview
)->_inline_data)
[0]);
1685 while (CCV_IS_TENSOR_MULTIVIEW(tv)((*(int*)(tv)) & CCV_TENSOR_MULTIVIEW))
1686 tv = (ccv_nnc_tensor_t*)(CCV_NNC_IS_METADATA_POS(CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)[0])((uintptr_t)((((ccv_nnc_tensor_multiview_t*)tv)->_heap_data
? ((ccv_nnc_tensor_multiview_t*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t
*)tv)->_inline_data)[0]) & 1)
? _ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]) : CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)tv)(((ccv_nnc_tensor_multiview_t*)tv)->_heap_data ? ((ccv_nnc_tensor_multiview_t
*)tv)->_heap_data : ((ccv_nnc_tensor_multiview_t*)tv)->
_inline_data)
[0]);
1687 *ref_tensor = ccv_nnc_tensor(tv->data.u8, tv->info, 0);
1688 ref_tensor->data = tv->data;
1689 ref_tensor->dataof = tv->dataof;
1690 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, (ccv_nnc_tensor_t*)(intptr_t)ref_pos);
1691 } else
1692 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, is_sub_arena_out_tensor ? vt_tensor : (ccv_nnc_tensor_t*)(intptr_t)vt_pos);
1693 }
1694 }
1695 }
1696 // After aliases are created, for case..of statements, we now revert back to a flat tensor rather than a multi-view.
1697 // No worries though, this new tensor is subscribed for the phi multi-view. Moreover, we have logic
1698 // when initializing the case..of node, which will take the phi multi-view again.
1699 for (i = 0; i < tensor_symbol_info_size; i++)
1700 if (tensor_blocks[i].bypass_ref && tensor_arena->vt_tensors[i])
1701 {
1702 assert(CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i]))((void) sizeof ((((uintptr_t)(tensor_arena->vt_tensors[i])
& 1)) ? 1 : 0), __extension__ ({ if (((uintptr_t)(tensor_arena
->vt_tensors[i]) & 1)) ; else __assert_fail ("CCV_NNC_IS_METADATA_POS(tensor_arena->vt_tensors[i])"
, "ccv_nnc_symbolic_graph_compile.c", 1702, __extension__ __PRETTY_FUNCTION__
); }))
;
1703 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1704 assert(mv->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((mv->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (mv->anchor == (intptr_t)0x1) ; else __assert_fail (
"mv->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1704, __extension__ __PRETTY_FUNCTION__); }))
;
1705 tensor_arena->vt_tensors[i] = (ccv_nnc_tensor_t*)(intptr_t)_ccv_nnc_tensor_flat_if_multiview(tensor_arena->tensor_metadata, (int)(intptr_t)tensor_arena->vt_tensors[i]);
1706 }
1707 // rewire the rest. I can rewire multiple times because I can identify whether this is wired or not.
1708 for (i = 0; i < tensor_symbol_info_size; i++)
1709 if (tensor_arena->vt_tensors[i])
1710 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_metadata_rewire(tensor_arena->tensor_metadata, tensor_arena->vt_tensors[i]);
1711 // Associate multiview tensors from sub arena to the parent.
1712 if (sub_arena_out_tensors)
1713 {
1714 for (i = 0; i < alloc_prep->block_size; i++)
1715 if (alloc_prep->blocks[i].block_ref < tensor_symbol_info_size)
1716 {
1717 const int block_ref = alloc_prep->blocks[i].block_ref;
1718 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == UNASSIGNED))
1719 continue;
1720 int sub_arena_ref = block_ref;
1721 if (TENSOR_EXPECT_ALIAS(tensor_blocks[block_ref])((tensor_blocks[block_ref].flags & 0x3) == ALIAS))
1722 {
1723 // Assigning out the tensor aliases.
1724 assert(tensor_symbol_info[block_ref].alias_ref)((void) sizeof ((tensor_symbol_info[block_ref].alias_ref) ? 1
: 0), __extension__ ({ if (tensor_symbol_info[block_ref].alias_ref
) ; else __assert_fail ("tensor_symbol_info[block_ref].alias_ref"
, "ccv_nnc_symbolic_graph_compile.c", 1724, __extension__ __PRETTY_FUNCTION__
); }))
;
1725 const int alias_ref = tensor_symbol_info[block_ref].alias_ref - 1;
1726 // It referenced to is not an alias.
1727 assert(tensor_arena->vt_tensors[alias_ref])((void) sizeof ((tensor_arena->vt_tensors[alias_ref]) ? 1 :
0), __extension__ ({ if (tensor_arena->vt_tensors[alias_ref
]) ; else __assert_fail ("tensor_arena->vt_tensors[alias_ref]"
, "ccv_nnc_symbolic_graph_compile.c", 1727, __extension__ __PRETTY_FUNCTION__
); }))
;
1728 sub_arena_ref = alias_ref;
1729 if (!sub_arena_out_tensors[sub_arena_ref])
1730 continue;
1731 }
1732 if (!sub_arena_out_tensors[sub_arena_ref])
1733 continue;
1734 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[sub_arena_ref])((*(int*)(sub_arena_out_tensors[sub_arena_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[sub_arena_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[sub_arena_ref]->alias_ref);
1735 assert(CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW)) ?
1 : 0), __extension__ ({ if (((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("CCV_IS_TENSOR_MULTIVIEW(mv)", "ccv_nnc_symbolic_graph_compile.c"
, 1735, __extension__ __PRETTY_FUNCTION__); }))
;
1736 // This is only possible if the vt_tensors is a phi node.
1737 if (tensor_arena->vt_tensors[block_ref]->alias_ref)
1738 {
1739 // For phi node, the sub_arena_out_tensors are only relevant to its selected output. Therefore, setting that to be the receiver of the broadcast.
1740 ccv_nnc_tensor_multiview_t* const phi = (ccv_nnc_tensor_multiview_t*)(tensor_arena->vt_tensors[block_ref]->alias_ref);
1741 assert(phi->anchor == CCV_NNC_MULTIVIEW_PHI)((void) sizeof ((phi->anchor == (intptr_t)0x1) ? 1 : 0), __extension__
({ if (phi->anchor == (intptr_t)0x1) ; else __assert_fail
("phi->anchor == CCV_NNC_MULTIVIEW_PHI", "ccv_nnc_symbolic_graph_compile.c"
, 1741, __extension__ __PRETTY_FUNCTION__); }))
;
1742 assert(!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1]))((void) sizeof ((!((*(int*)(((phi)->_heap_data ? (phi)->
_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(((phi)->_heap_data
? (phi)->_heap_data : (phi)->_inline_data)[1])) & CCV_TENSOR_MULTIVIEW
)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(phi)[1])"
, "ccv_nnc_symbolic_graph_compile.c", 1742, __extension__ __PRETTY_FUNCTION__
); }))
;
1743 CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]->alias_ref = (uintptr_t)mv;
1744 ccv_nnc_tensor_synchronize_to_multiview(mv, CCV_NNC_MULTIVIEW_DATA(phi)((phi)->_heap_data ? (phi)->_heap_data : (phi)->_inline_data
)
[1]);
1745 } else {
1746 tensor_arena->vt_tensors[block_ref]->alias_ref = (uintptr_t)mv;
1747 ccv_nnc_tensor_synchronize_to_multiview(mv, tensor_arena->vt_tensors[block_ref]);
1748 }
1749 }
1750 }
1751 // Go over all the tensors that has assign_ref. If the tensor it is assigned from is:
1752 // 1). From sub_arena_out_tensors, it could be possible that it now pointing to an area this arena doesn't know.
1753 // 2). From phi multi-view, for this case, it is in fact that this arena won't know which memory I am going to use prior.
1754 // Therefore, for above two scenarios, the tensor has assign_ref, even it is a multiview tensor, need to subscribe
1755 // to the output of assign_ref tensor.
1756 for (i = 0; i < tensor_symbol_info_size; i++)
1757 if (tensor_arena->vt_tensors[i] && tensor_symbol_info[i].assign_ref)
1758 {
1759 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
1760 ccv_nnc_tensor_t* assign_tensor;
1761 if (sub_arena_out_tensors && sub_arena_out_tensors[assign_ref])
1762 assign_tensor = CCV_IS_TENSOR_MULTIVIEW(sub_arena_out_tensors[assign_ref])((*(int*)(sub_arena_out_tensors[assign_ref])) & CCV_TENSOR_MULTIVIEW
)
? sub_arena_out_tensors[assign_ref] : (ccv_nnc_tensor_t*)sub_arena_out_tensors[assign_ref]->alias_ref;
1763 else
1764 assign_tensor = tensor_arena->vt_tensors[assign_ref];
1765 ccv_nnc_graph_add_carry_over(graph_prep->graph, assign_tensor, tensor_arena->vt_tensors[i]);
1766 }
1767 // After everything handled, assertion again to make sure the tensors and tensor binds pointing to the right location. This is really just for assertion.
1768 for (i = 0; i < tensor_bind_size; i++)
1769 {
1770 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 1770, __extension__ __PRETTY_FUNCTION__
); }))
;
1771 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(graph_prep->symbolic_graph, tensor_binds[i].symbol);
1772 if (resolved_symbol.d >= 0)
1773 {
1774 int d = resolved_symbol.d;
1775 // This check is for in-place ops. Only in-place op could have unassigned but ref.
1776 // It has nothing to do with alias.
1777 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
1778 d = tensor_blocks[d].ref - 1;
1779 // Note we don't trace back on alias. This is intentional.
1780 assert(tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8)((void) sizeof ((tensor_arena->vt_tensors[d]->data.u8 ==
tensor_binds[i].tensor->data.u8) ? 1 : 0), __extension__ (
{ if (tensor_arena->vt_tensors[d]->data.u8 == tensor_binds
[i].tensor->data.u8) ; else __assert_fail ("tensor_arena->vt_tensors[d]->data.u8 == tensor_binds[i].tensor->data.u8"
, "ccv_nnc_symbolic_graph_compile.c", 1780, __extension__ __PRETTY_FUNCTION__
); }))
;
1781 }
1782 }
1783 if (sub_arena_out_tensors)
1784 ccfreefree(sub_arena_out_tensors);
1785 // Rewire sub arena's tensor references.
1786 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1787 if (tensor_arena->sub_arenas[i])
1788 {
1789 const int exec_idx = graph_prep->sub_preps[i]->exec_idx - 1;
1790 const ccv_nnc_graph_exec_symbol_info_t* const node = graph_prep->exec_symbol_info + exec_idx;
1791 for (j = 0; j < node->input_size; j++)
1792 {
1793 const int idx = node->inputs[j];
1794 const int s_idx = (tensor_symbol_info[idx].s_ref && tensor_symbol_info[idx].s_ref->rnum > i) ? *(int*)ccv_array_get(tensor_symbol_info[idx].s_ref, i)((void*)(((char*)((tensor_symbol_info[idx].s_ref)->data)) +
(size_t)(tensor_symbol_info[idx].s_ref)->rsize * (size_t)
(i)))
- 1 : -1;
1795 if (s_idx < 0)
1796 continue;
1797 ccv_nnc_tensor_t* sub_tensor = tensor_arena->sub_arenas[i]->vt_tensors[s_idx];
1798 // Only do the replacement if it is a multi-view tensor.
1799 // sub_tensor can be unassigned if it is a tape variable. It will get fixed up later from its pair.
1800 if (sub_tensor && CCV_IS_TENSOR_MULTIVIEW(sub_tensor)((*(int*)(sub_tensor)) & CCV_TENSOR_MULTIVIEW))
1801 {
1802 // This is binded tensor, bind it now.
1803 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[idx])((tensor_blocks[idx].flags & 0x3) == UNASSIGNED))
1804 _ccv_nnc_tensor_multiview_full_pos((ccv_nnc_tensor_multiview_t*)sub_tensor, tensor_arena->vt_tensors[idx]);
1805 else
1806 _ccv_nnc_tensor_multiview_full_pos_rewire(tensor_arena->tensor_metadata, (ccv_nnc_tensor_multiview_t*)sub_tensor);
1807 }
1808 }
1809 }
1810 return tensor_arena;
1811}
1812
1813static ccv_nnc_tensor_t* _ccv_nnc_tensor_arena_find_pair_ref(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const int pair_ref)
1814{
1815 assert(graph)((void) sizeof ((graph) ? 1 : 0), __extension__ ({ if (graph)
; else __assert_fail ("graph", "ccv_nnc_symbolic_graph_compile.c"
, 1815, __extension__ __PRETTY_FUNCTION__); }))
;
1816 if ((intptr_t)graph == tensor_arena->graph_ref)
1817 {
1818 assert(pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size)((void) sizeof ((pair_ref >= 0 && pair_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (pair_ref >=
0 && pair_ref < tensor_arena->vt_tensor_size) ;
else __assert_fail ("pair_ref >= 0 && pair_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1818, __extension__ __PRETTY_FUNCTION__
); }))
;
1819 return tensor_arena->vt_tensors[pair_ref];
1820 }
1821 int i;
1822 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1823 if (tensor_arena->sub_arenas[i])
1824 {
1825 ccv_nnc_tensor_t* const tensor = _ccv_nnc_tensor_arena_find_pair_ref(tensor_arena->sub_arenas[i], graph, pair_ref);
1826 if (tensor)
1827 return tensor;
1828 }
1829 return 0;
1830}
1831
1832static void _ccv_nnc_tensor_mark_as_tape_var(ccv_nnc_tensor_t* const tensor)
1833{
1834 if (!CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1835 tensor->type |= CCV_TAPE_ALLOC;
1836 else {
1837 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)tensor;
1838 mv->type |= CCV_TAPE_ALLOC;
1839 int i;
1840 for (i = 0; i < mv->repeat + mv->kind; i++)
1841 _ccv_nnc_tensor_mark_as_tape_var(CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i]);
1842 }
1843}
1844
1845static void _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(const ccv_nnc_tensor_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_tensor_arena_t* const tensor_arena)
1846{
1847 assert(tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)graph_prep
->symbolic_graph) ? 1 : 0), __extension__ ({ if (tensor_arena
->graph_ref == (intptr_t)graph_prep->symbolic_graph) ; else
__assert_fail ("tensor_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 1847, __extension__ __PRETTY_FUNCTION__
); }))
;
1848 int i;
1849 for (i = 0; i < graph_prep->tensor_symbol_info_size; i++)
1850 {
1851 if (graph_prep->tensor_symbol_info[i].pair_ref)
1852 {
1853 tensor_arena->vt_tensors[i] = _ccv_nnc_tensor_arena_find_pair_ref(root_arena, graph_prep->symbolic_graph->pair, graph_prep->tensor_symbol_info[i].pair_ref - 1);
1854 // No need to continue check this if it is from its pair.
1855 continue;
1856 }
1857 if ((graph_prep->tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && tensor_arena->vt_tensors[i])
1858 {
1859 // If it is a normal tensor, and the buffer it relies on is read only, no need to mark as tape var.
1860 if (!CCV_IS_TENSOR_MULTIVIEW(tensor_arena->vt_tensors[i])((*(int*)(tensor_arena->vt_tensors[i])) & CCV_TENSOR_MULTIVIEW
)
)
1861 {
1862 const int vt_ref = graph_prep->alloc_prep->vt_blocks[i];
1863 if (vt_ref >= 0 &&
1864 TENSOR_READ_WRITE(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep->blocks[vt_ref].buffer_ref])(graph_prep->alloc_prep->buffers[graph_prep->alloc_prep
->blocks[vt_ref].buffer_ref].flags & 0xc)
== READ_ONLY)
1865 continue;
1866 }
1867 _ccv_nnc_tensor_mark_as_tape_var(tensor_arena->vt_tensors[i]);
1868 }
1869 }
1870 for (i = 0; i < graph_prep->sub_prep_size; i++)
1871 if (graph_prep->sub_preps[i])
1872 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(root_arena, graph_prep->sub_preps[i], tensor_arena->sub_arenas[i]);
1873}
1874
1875static void _ccv_nnc_tensor_block_add_exec(const ccv_sparse_matrix_t* const exec_dep, const int idx, ccv_nnc_tensor_block_t tensor_blocks)
1876{
1877 int i, found = 0;
1878 // Try to insert head.
1879 ccv_array_t* head = tensor_blocks.head;
1880 assert(head)((void) sizeof ((head) ? 1 : 0), __extension__ ({ if (head) ;
else __assert_fail ("head", "ccv_nnc_symbolic_graph_compile.c"
, 1880, __extension__ __PRETTY_FUNCTION__); }))
;
1881 for (i = 0; i < head->rnum;)
1882 {
1883 const int head_idx = *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
;
1884 if (head_idx == idx)
1885 {
1886 found = 1;
1887 break;
1888 }
1889 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, head_idx, idx);
1890 if (cell.i32 && cell.i32[0] > 0)
1891 {
1892 /* If the current node is the parent of the head node, check if we found it or not. */
1893 /* If not found, replace the current one. */
1894 if (!found)
1895 {
1896 found = 1;
1897 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= idx;
1898 } else {
1899 /* Remove the current one, change the rnum. */
1900 if (i < head->rnum - 1)
1901 *(int*)ccv_array_get(head, i)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(head, head->rnum - 1)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(head->rnum - 1)))
;
1902 --head->rnum;
1903 continue;
1904 }
1905 } else {
1906 // If the head is the parent of the idx, we cannot add it to the array (it is deterministically later than head).
1907 cell = ccv_get_sparse_matrix_cell(exec_dep, idx, head_idx);
1908 if (cell.i32 && cell.i32[0] > 0)
1909 {
1910 found = 1;
1911 break;
1912 }
1913 }
1914 /* Advancing i. */
1915 ++i;
1916 }
1917 /* If not found, push this idx to the end of the array. */
1918 if (!found)
1919 ccv_array_push(head, &idx);
1920 // Try to insert tail.
1921 found = 0;
1922 ccv_array_t* tail = tensor_blocks.tail;
1923 assert(tail)((void) sizeof ((tail) ? 1 : 0), __extension__ ({ if (tail) ;
else __assert_fail ("tail", "ccv_nnc_symbolic_graph_compile.c"
, 1923, __extension__ __PRETTY_FUNCTION__); }))
;
1924 for (i = 0; i < tail->rnum;)
1925 {
1926 const int tail_idx = *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
;
1927 if (tail_idx == idx)
1928 {
1929 found = 1;
1930 break;
1931 }
1932 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, tail_idx);
1933 if (cell.i32 && cell.i32[0] > 0)
1934 {
1935 /* If the current node is the child of the tail node, check if we found it or not. */
1936 /* If not found, replace the current one. */
1937 if (!found)
1938 {
1939 found = 1;
1940 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= idx;
1941 } else {
1942 /* Remove the current one, change the rnum. */
1943 *(int*)ccv_array_get(tail, i)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(i)))
= *(int*)ccv_array_get(tail, tail->rnum - 1)((void*)(((char*)((tail)->data)) + (size_t)(tail)->rsize
* (size_t)(tail->rnum - 1)))
;
1944 --tail->rnum;
1945 continue;
1946 }
1947 } else {
1948 // If the tail is the child of the idx, we cannot add it to the array (it is deterministically earlier than tail).
1949 cell = ccv_get_sparse_matrix_cell(exec_dep, tail_idx, idx);
1950 if (cell.i32 && cell.i32[0] > 0)
1951 {
1952 found = 1;
1953 break;
1954 }
1955 }
1956 /* Advancing i. */
1957 ++i;
1958 }
1959 /* If not found, push this idx to the end of the array. */
1960 if (!found)
1961 ccv_array_push(tail, &idx);
1962}
1963
1964ccv_nnc_tensor_t* ccv_nnc_tensor_from_symbol(const ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol)
1965{
1966 if ((intptr_t)symbol.graph == tensor_arena->graph_ref)
1967 {
1968 assert(symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d >= 0 && symbol.d < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (symbol.d >=
0 && symbol.d < tensor_arena->vt_tensor_size) ;
else __assert_fail ("symbol.d >= 0 && symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 1968, __extension__ __PRETTY_FUNCTION__
); }))
;
1969 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[symbol.d];
1970 if (tensor && CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
1971 {
1972 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
1973 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
1974 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
1975 return (ccv_nnc_tensor_t*)mv;
1976 }
1977 return tensor;
1978 }
1979 int i;
1980 for (i = 0; i < tensor_arena->sub_arena_size; i++)
1981 if (tensor_arena->sub_arenas[i])
1982 {
1983 ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_symbol(tensor_arena->sub_arenas[i], symbol);
1984 if (tensor)
1985 return tensor;
1986 }
1987 return 0;
1988}
1989
1990ccv_nnc_graph_exec_t ccv_nnc_graph_exec_from_symbol(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const ccv_nnc_graph_exec_symbol_t symbol)
1991{
1992 if ((intptr_t)symbol.graph == graph_exec_arena->graph_ref)
1993 {
1994 assert(symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size)((void) sizeof ((symbol.d >= 0 && symbol.d < graph_exec_arena
->graph_exec_size) ? 1 : 0), __extension__ ({ if (symbol.d
>= 0 && symbol.d < graph_exec_arena->graph_exec_size
) ; else __assert_fail ("symbol.d >= 0 && symbol.d < graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 1994, __extension__ __PRETTY_FUNCTION__
); }))
;
1995 return graph_exec_arena->graph_execs[symbol.d];
1996 }
1997 int i;
1998 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
1999 if (graph_exec_arena->sub_arenas[i])
2000 {
2001 ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena->sub_arenas[i], symbol);
2002 if (!CCV_NO_GRAPH_EXEC(exec)((exec).graph == 0))
2003 return exec;
2004 }
2005 return (ccv_nnc_graph_exec_t){}; // 0.
2006}
2007
2008ccv_nnc_graph_exec_t ccv_nnc_graph_exec_source(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2009{
2010 return graph_exec_arena->source;
2011}
2012
2013ccv_nnc_graph_exec_t ccv_nnc_graph_exec_destination(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
2014{
2015 return graph_exec_arena->destination;
2016}
2017
2018// Check whether the head is the beginning of this block.
2019static int _ccv_nnc_tensor_block_check_head(const ccv_nnc_tensor_block_t* const tensor_block, const int head_node)
2020{
2021 assert(tensor_block->head)((void) sizeof ((tensor_block->head) ? 1 : 0), __extension__
({ if (tensor_block->head) ; else __assert_fail ("tensor_block->head"
, "ccv_nnc_symbolic_graph_compile.c", 2021, __extension__ __PRETTY_FUNCTION__
); }))
;
2022 return (tensor_block->head->rnum == 1 && *(int*)ccv_array_get(tensor_block->head, 0)((void*)(((char*)((tensor_block->head)->data)) + (size_t
)(tensor_block->head)->rsize * (size_t)(0)))
== head_node);
2023}
2024
2025// Check whether the tail is the end of this block.
2026static int _ccv_nnc_tensor_block_check_tail(const ccv_nnc_tensor_block_t* const tensor_block, const int tail_node)
2027{
2028 assert(tensor_block->tail)((void) sizeof ((tensor_block->tail) ? 1 : 0), __extension__
({ if (tensor_block->tail) ; else __assert_fail ("tensor_block->tail"
, "ccv_nnc_symbolic_graph_compile.c", 2028, __extension__ __PRETTY_FUNCTION__
); }))
;
2029 return (tensor_block->tail->rnum == 1 && *(int*)ccv_array_get(tensor_block->tail, 0)((void*)(((char*)((tensor_block->tail)->data)) + (size_t
)(tensor_block->tail)->rsize * (size_t)(0)))
== tail_node);
2030}
2031
2032// Make two tensor blocks one. Return 1 if that happened.
2033static int _ccv_nnc_tensor_blocks_try_fold(ccv_nnc_tensor_block_t* const tensor_blocks, const int p_ref_0, const int p_ref_1)
2034{
2035 // Now we are sure p_ref_0 points to the input, p_ref_1 points to the output.
2036 if (!TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags & UNFOLDABLE_AS_INPUT) &&
2037 (!TENSOR_IS_UNFOLDABLE_AS_OUTPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_OUTPUT) || tensor_blocks[p_ref_1].unfoldable_except_ref == p_ref_0 + 1) &&
2038 tensor_blocks[p_ref_0].tail->rnum == 1 &&
2039 tensor_blocks[p_ref_1].head->rnum == 1 &&
2040 tensor_blocks[p_ref_0].type == tensor_blocks[p_ref_1].type && // Must be the same type.
2041 *(int*)ccv_array_get(tensor_blocks[p_ref_0].tail, 0)((void*)(((char*)((tensor_blocks[p_ref_0].tail)->data)) + (
size_t)(tensor_blocks[p_ref_0].tail)->rsize * (size_t)(0))
)
== *(int*)ccv_array_get(tensor_blocks[p_ref_1].head, 0)((void*)(((char*)((tensor_blocks[p_ref_1].head)->data)) + (
size_t)(tensor_blocks[p_ref_1].head)->rsize * (size_t)(0))
)
)
2042 {
2043 // If the two parent refs matches (thus, they meet at the same node), we can concatenate with each other and mark one as a ref. This is very similar to in-place operation combining.
2044 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0]))((void) sizeof (((!((tensor_blocks[p_ref_0].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_0].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_0].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_0].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 2044, __extension__ __PRETTY_FUNCTION__); }))
;
2045 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1]))((void) sizeof (((!((tensor_blocks[p_ref_1].flags & 0x3) ==
ALIAS) && !((tensor_blocks[p_ref_1].flags & 0x3)
== UNASSIGNED))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks
[p_ref_1].flags & 0x3) == ALIAS) && !((tensor_blocks
[p_ref_1].flags & 0x3) == UNASSIGNED))) ; else __assert_fail
("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 2045, __extension__ __PRETTY_FUNCTION__); }))
;
2046 ccv_array_free(tensor_blocks[p_ref_0].tail);
2047 tensor_blocks[p_ref_0].tail = tensor_blocks[p_ref_1].tail;
2048 if (tensor_blocks[p_ref_1].p_refs[0])
2049 {
2050 assert(tensor_blocks[p_ref_1].p_refs[1] == 0)((void) sizeof ((tensor_blocks[p_ref_1].p_refs[1] == 0) ? 1 :
0), __extension__ ({ if (tensor_blocks[p_ref_1].p_refs[1] ==
0) ; else __assert_fail ("tensor_blocks[p_ref_1].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2050, __extension__ __PRETTY_FUNCTION__
); }))
; // It simply cannot have more than one p_refs, otherwise we cannot merge.
2051 if (!tensor_blocks[p_ref_0].p_refs[0])
2052 tensor_blocks[p_ref_0].p_refs[0] = tensor_blocks[p_ref_1].p_refs[0];
2053 else
2054 tensor_blocks[p_ref_0].p_refs[1] = tensor_blocks[p_ref_1].p_refs[0];
2055 }
2056 tensor_blocks[p_ref_0].pin_mem = tensor_blocks[p_ref_0].pin_mem || tensor_blocks[p_ref_1].pin_mem;
2057 TENSOR_SET_READ_WRITE(tensor_blocks[p_ref_0], TENSOR_READ_WRITE(tensor_blocks[p_ref_0]) | TENSOR_READ_WRITE(tensor_blocks[p_ref_1]))(tensor_blocks[p_ref_0].flags = ((tensor_blocks[p_ref_0].flags
& ~0xc) | (tensor_blocks[p_ref_0].flags & 0xc) | (tensor_blocks
[p_ref_1].flags & 0xc)))
;
2058 ccv_array_free(tensor_blocks[p_ref_1].head);
2059 if (TENSOR_IS_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags & UNFOLDABLE_AS_INPUT))
2060 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[p_ref_0])(tensor_blocks[p_ref_0].flags = (tensor_blocks[p_ref_0].flags
| UNFOLDABLE_AS_INPUT))
;
2061 // Don't need to check UNFOLDABLE_AS_OUTPUT for p_ref_1 because if it is so, we cannot fold right now.
2062 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[p_ref_1])(tensor_blocks[p_ref_1].flags = ((tensor_blocks[p_ref_1].flags
& ~0x3) | UNASSIGNED))
;
2063 tensor_blocks[p_ref_1].ref = p_ref_0 + 1;
2064 if (!tensor_blocks[p_ref_0].r_refs)
2065 tensor_blocks[p_ref_0].r_refs = ccv_array_new(sizeof(int), 0, 0);
2066 ccv_array_add_unique_int(tensor_blocks[p_ref_0].r_refs, p_ref_1 + 1);
2067 tensor_blocks[p_ref_1].size = 0;
2068 tensor_blocks[p_ref_1].head = 0;
2069 tensor_blocks[p_ref_1].tail = 0;
2070 return 1;
2071 }
2072 return 0;
2073}
2074
2075static void _ccv_nnc_exec_dep_and_tensor_blocks_prep(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int unroll_count, const int* const dup_tensor_block_ref, const int* const dup_tensor_from_ref, const int* const dup_exec_from_ref, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks)
2076{
2077 int i, j, k;
2078 // Generate exec dependencies (or, in other words, partial ordering of executions).
2079 ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(symbolic_graph->exec_symbol_info->rnum, symbolic_graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
2080 int* buf = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * 2);
2081 int buf_size;
2082 if (p_node_info)
2083 { assert(output_size == 0)((void) sizeof ((output_size == 0) ? 1 : 0), __extension__ ({
if (output_size == 0) ; else __assert_fail ("output_size == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2083, __extension__ __PRETTY_FUNCTION__
); }))
; }
2084#define for_block(x, val) \
2085 do { \
2086 if (((int32_t*)val)[0] > 0) \
2087 { \
2088 buf[buf_size * 2] = x; \
2089 buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
2090 ++buf_size; \
2091 } \
2092 } while (0)
2093 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int term __attribute__((unused)) = (visit)->node[_i_
].term; typeof ((exec_symbol_info)) const node __attribute__(
(unused)) = (exec_symbol_info) + idx;
{
2094 buf_size = 0; /* save all its parent deps to this buffer */
2095 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
2096 if (vector)
2097 CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S
: { do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F:
{ do { int _i_; __attribute__((unused)) const size_t _c_ = (
((exec_dep)->type) & 0xFFF); if ((exec_dep)->type &
CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size
; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_
))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t
) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000
) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) &
-4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for
(_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t
* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_
* _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t
_d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.f64 + (0))); } } } while (0); break; } default: { do
{ int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep
)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR
) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block
((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const
size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size
[(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep
)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_
= (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector
)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_
= (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if
(_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_
= { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->
i), (_d_.u8 + (0))); } } } while (0); } } } while (0)
;
2098 if (!node->outgoings)
2099 continue;
2100 for (i = 0; i < node->outgoings->rnum; i++)
2101 {
2102 int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2103 const int32_t one = 1;
2104 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
2105 /* If not found, set, if the current node is the destination node, no need
2106 * set itself as parent of subsequent nodes because its terminal nature. */
2107 if (!cell.i32 || cell.i32[0] == 0)
2108 ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
2109 if (buf_size > 0)
2110 {
2111 ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
2112 assert(vector)((void) sizeof ((vector) ? 1 : 0), __extension__ ({ if (vector
) ; else __assert_fail ("vector", "ccv_nnc_symbolic_graph_compile.c"
, 2112, __extension__ __PRETTY_FUNCTION__); }))
;
2113 for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
2114 {
2115 ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
2116 /* If not found, set */
2117 if (!cell.i32 || cell.i32[0] == 0)
2118 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
2119 else {
2120 /* Otherwise, set to the longest one */
2121 int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1])({ typeof (cell.i32[0]) _a = (cell.i32[0]); typeof (buf[j * 2
+ 1]) _b = (buf[j * 2 + 1]); (_a > _b) ? _a : _b; })
;
2122 ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
2123 }
2124 }
2125 }
2126 }
2127 } ccv_nnc_graph_visit_endfor} }
2128#undef for_block
2129 ccfreefree(buf);
2130 // This struct is allocated earlier to collect information about the tensor's expected start / end execs.
2131 const int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
2132 ccv_nnc_tensor_block_t* tensor_blocks = (ccv_nnc_tensor_block_t*)cccalloccalloc(tensor_block_size, sizeof(ccv_nnc_tensor_block_t));
2133 // The reason is that I need to make everyone of them to be unassigned unless it is used somewhere. It
2134 // happens that I have to loop through all relevant node to find out if one is used or not.
2135 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2136 tensor_blocks[i].flags = UNASSIGNED, tensor_blocks[i].type = tensor_symbol_info[i].info.type, tensor_blocks[i].bypass_ref = tensor_symbol_info[i].bypass_ref;
2137 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2138 for (i = 0; i < node->input_size; i++)
2139 if (node->inputs[i] >= 0)
2140 {
2141 tensor_blocks[node->inputs[i]].flags = 0;
2142 // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2143 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2144 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->inputs[i]].type)((tensor_blocks[node->inputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2145 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2146 tensor_blocks[node->inputs[i]].pin_mem = 1;
2147 }
2148 for (i = 0; i < node->output_size; i++)
2149 if (node->outputs[i] >= 0)
2150 {
2151 tensor_blocks[node->outputs[i]].flags = 0;
2152 // If this is a data transfer node, and this is a CPU memory, mark the memory type to be pinned mem.
2153 // This will get propagated back to the buffer, and used there to determine the allocation function to use.
2154 if (CCV_TENSOR_GET_MEMORY(tensor_blocks[node->outputs[i]].type)((tensor_blocks[node->outputs[i]].type) & 0x3) == CCV_TENSOR_CPU_MEMORY &&
2155 (node->cmd.cmd == CCV_NNC_DATA_TRANSFER_FORWARD || node->cmd.cmd == CCV_NNC_DATA_TRANSFER_BACKWARD))
2156 tensor_blocks[node->outputs[i]].pin_mem = 1;
2157 }
2158 } ccv_nnc_graph_visit_endfor} }
2159 if (p_node_info)
2160 {
2161 assert(p_tensor_symbol_info)((void) sizeof ((p_tensor_symbol_info) ? 1 : 0), __extension__
({ if (p_tensor_symbol_info) ; else __assert_fail ("p_tensor_symbol_info"
, "ccv_nnc_symbolic_graph_compile.c", 2161, __extension__ __PRETTY_FUNCTION__
); }))
;
2162 // Mark it as used if it is used in either input or output.
2163 for (i = 0; i < p_node_info->input_size; i++)
2164 if (p_node_info->inputs[i] >= 0)
2165 {
2166 const int d = p_node_info->inputs[i];
2167 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2168 {
2169 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2170 if (dd >= 0) // If this exists in this sub-graph, great.
2171 tensor_blocks[dd].flags = 0;
2172 }
2173 }
2174 for (i = 0; i < p_node_info->output_size; i++)
2175 if (p_node_info->outputs[i] >= 0)
2176 {
2177 const int d = p_node_info->outputs[i];
2178 if (p_tensor_symbol_info[d].s_ref && p_tensor_symbol_info[d].s_ref->rnum >= symbolic_graph->p_idx)
2179 {
2180 const int dd = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, symbolic_graph->p_idx - 1)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(symbolic_graph->p_idx - 1)))
- 1;
2181 if (dd >= 0) // If this exists in this sub-graph, great.
2182 tensor_blocks[dd].flags = 0;
2183 }
2184 }
2185 }
2186 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2187 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2188 {
2189 // Check no tensor info is auto now.
2190 assert(!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info))((void) sizeof ((!ccv_nnc_is_tensor_auto(tensor_symbol_info[i
].info)) ? 1 : 0), __extension__ ({ if (!ccv_nnc_is_tensor_auto
(tensor_symbol_info[i].info)) ; else __assert_fail ("!ccv_nnc_is_tensor_auto(tensor_symbol_info[i].info)"
, "ccv_nnc_symbolic_graph_compile.c", 2190, __extension__ __PRETTY_FUNCTION__
); }))
;
2191 // If this tensor is used in assign_ref, set it to be un-foldable. (It will be used as parameter,
2192 // therefore, itself life-cycle almost certainly won't concatenate properly with the tensor to
2193 // fold to).
2194 if (tensor_symbol_info[i].assign_ref)
2195 {
2196 // TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i]);
2197 // It can be folded as input (it is fine to be overwritten), but it cannot as output (when folded as input,
2198 // it kept its own representation, which is not the case for output).
2199 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2200 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2201 // But for where it comes from, it cannot be folded as input, because it cannot be overwritten any time.
2202 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2203 // It also cannot be folded as output (except i), because we need to keep its own representation.
2204 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[assign_ref])(tensor_blocks[assign_ref].flags = (tensor_blocks[assign_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2205 assert(tensor_blocks[assign_ref].unfoldable_except_ref == 0)((void) sizeof ((tensor_blocks[assign_ref].unfoldable_except_ref
== 0) ? 1 : 0), __extension__ ({ if (tensor_blocks[assign_ref
].unfoldable_except_ref == 0) ; else __assert_fail ("tensor_blocks[assign_ref].unfoldable_except_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2205, __extension__ __PRETTY_FUNCTION__
); }))
;
2206 tensor_blocks[assign_ref].unfoldable_except_ref = i + 1;
2207 for (j = 0; j < unroll_count; j++)
2208 {
2209 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_INPUT))
;
2210 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]].flags
= (tensor_blocks[dup_tensor_block_ref[i * unroll_count + j]]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2211 }
2212 if (tensor_blocks[assign_ref].bypass_ref)
2213 {
2214 // If it contains a bypass_ref, that means we can fold into both the bypass and except_ref, making it untenable.
2215 tensor_blocks[assign_ref].unfoldable_except_ref = 0;
2216 const int bypass_ref = tensor_blocks[assign_ref].bypass_ref - 1;
2217 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_INPUT))
;
2218 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_ref])(tensor_blocks[bypass_ref].flags = (tensor_blocks[bypass_ref]
.flags | UNFOLDABLE_AS_OUTPUT))
;
2219 // On the other hand, it can be folded into the except_ref for the bypass_ref.
2220 tensor_blocks[bypass_ref].unfoldable_except_ref = i + 1;
2221 if (dup_tensor_from_ref)
2222 {
2223 const int bypass_from_ref = dup_tensor_from_ref[bypass_ref];
2224 if (bypass_from_ref >= 0)
2225 {
2226 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_INPUT))
;
2227 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[bypass_from_ref])(tensor_blocks[bypass_from_ref].flags = (tensor_blocks[bypass_from_ref
].flags | UNFOLDABLE_AS_OUTPUT))
;
2228 assert(dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref)((void) sizeof ((dup_tensor_block_ref[bypass_from_ref * unroll_count
+ unroll_count - 1] == bypass_ref) ? 1 : 0), __extension__ (
{ if (dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count
- 1] == bypass_ref) ; else __assert_fail ("dup_tensor_block_ref[bypass_from_ref * unroll_count + unroll_count - 1] == bypass_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2228, __extension__ __PRETTY_FUNCTION__
); }))
;
2229 for (j = 0; j < unroll_count - 1; j++)
2230 {
2231 // Mark every incarnation as unfold-able.
2232 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_INPUT))
;
2233 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count + j]])(tensor_blocks[dup_tensor_block_ref[bypass_from_ref * unroll_count
+ j]].flags = (tensor_blocks[dup_tensor_block_ref[bypass_from_ref
* unroll_count + j]].flags | UNFOLDABLE_AS_OUTPUT))
;
2234 }
2235 }
2236 }
2237 }
2238 }
2239 }
2240 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2241 {
2242 // If it has a pair reference, we don't need to allocate this tensor at all,
2243 // set it to be unassigned.
2244 if (tensor_symbol_info[i].pair_ref)
2245 TENSOR_EXPECT_SET_UNASSIGNED(tensor_blocks[i])(tensor_blocks[i].flags = ((tensor_blocks[i].flags & ~0x3
) | UNASSIGNED))
;
2246 // If it is a tape variable, set it to be un-foldable too (otherwise we cannot use tape properly).
2247 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) {
2248 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2249 TENSOR_SET_UNFOLDABLE_AS_OUTPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_OUTPUT
))
;
2250 // For this case, there is no exception.
2251 tensor_blocks[i].unfoldable_except_ref = 0;
2252 } else if (tensor_symbol_info[i].p_ref) {
2253 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 2253, __extension__ __PRETTY_FUNCTION__); }))
;
2254 const int p_ref_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(tensor_symbol_info[i].p_ref - 1, p_node_info);
2255 // If I am a case of graph, and this tensor is the input from the parent graph, you cannot fold it as input.
2256 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2257 // TODO: This check can be lifted if we can fold in the parent graph.
2258 if (-1 == p_ref_is_in_or_out)
2259 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2260 if (1 == p_ref_is_in_or_out) // If p_ref is out, it cannot be fold as input.
2261 TENSOR_SET_UNFOLDABLE_AS_INPUT(tensor_blocks[i])(tensor_blocks[i].flags = (tensor_blocks[i].flags | UNFOLDABLE_AS_INPUT
))
;
2262 }
2263 }
2264 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2265 {
2266 if (tensor_symbol_info[i].alias_ref)
2267 {
2268 const int ref = tensor_symbol_info[i].alias_ref - 1;
2269 // If the referenced one is unassigned, mark this as assigned only if current one is assigned.
2270 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED) && !TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED))
2271 tensor_blocks[ref].flags = 0;
2272 // An alias cannot ref to another alias.
2273 assert(!tensor_symbol_info[ref].alias_ref)((void) sizeof ((!tensor_symbol_info[ref].alias_ref) ? 1 : 0)
, __extension__ ({ if (!tensor_symbol_info[ref].alias_ref) ; else
__assert_fail ("!tensor_symbol_info[ref].alias_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2273, __extension__ __PRETTY_FUNCTION__); }))
;
2274 tensor_blocks[i].flags = ALIAS;
2275 tensor_blocks[i].ref = ref + 1; // Assign the ref.
2276 if (!tensor_blocks[ref].r_refs)
2277 tensor_blocks[ref].r_refs = ccv_array_new(sizeof(int), 0, 0);
2278 ccv_array_add_unique_int(tensor_blocks[ref].r_refs, i + 1);
2279 }
2280 }
2281 // Scan again and if the ref is not assigned, mark the alias not assigned.
2282 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2283 if (TENSOR_EXPECT_ALIAS(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == ALIAS))
2284 {
2285 const int ref = tensor_blocks[i].ref - 1;
2286 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[ref])((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
2287 {
2288 // Mark this as unassigned.
2289 tensor_blocks[i].flags = UNASSIGNED;
2290 tensor_blocks[i].ref = 0;
2291 }
2292 }
2293 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2294 {
2295 // If this tensor is not expected to be unassigned, allocate the arrays for s and t.
2296 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
)
2297 {
2298 tensor_blocks[i].head = ccv_array_new(sizeof(int), 0, 0);
2299 tensor_blocks[i].tail = ccv_array_new(sizeof(int), 0, 0);
2300 // Cache tensor size (align to 16 bytes).
2301 tensor_blocks[i].size = (uint64_t)ccv_nnc_tensor_data_size(tensor_symbol_info[i].info);
2302 }
2303 // If there is a p_ref, add the one to the p_refs list.
2304 if (tensor_symbol_info[i].p_ref)
2305 tensor_blocks[i].p_refs[0] = tensor_symbol_info[i].p_ref;
2306 }
2307 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2308 for (i = 0; i < node->input_size; i++)
2309 {
2310 int d = node->inputs[i];
2311 if (d < 0)
2312 continue;
2313 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2314 d = tensor_symbol_info[d].alias_ref - 1;
2315 tensor_blocks[d].flags |= READ_ONLY;
2316 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2317 continue;
2318 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2318, __extension__ __PRETTY_FUNCTION__
); }))
;
2319 /* If this is the first encounter, its head starts (this tensor is init'ed outside of the graph)
2320 * from the very beginning of the graph life-cycle and ends here. */
2321 if (tensor_blocks[d].head->rnum == 0 && !TENSOR_REQUIRE_INIT(tensor_symbol_info[d].flags)(((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[d].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
2322 {
2323 for (j = 0; j < source_size; j++)
2324 {
2325 // If the source is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2326 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, sources[j].d);
2327 if (cell.i32 && cell.i32[0] > 0)
2328 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[d]);
2329 }
2330 /* If this is a read-only (based on SSA, if first encountered as read), and this is
2331 * sub-graph (TODO: this condition can be lifted for case..of that is never in a while
2332 * loop, however, in that case, you need to prevent read-only gets reused for the
2333 * output tensor, which is not obvious how to implement correctly), and it is not
2334 * assign_ref from anywhere (not a parameterized loop). We cannot reuse this region
2335 * of memory anyway (because on second loop, we want to read the same value out).
2336 * Mark it to the end of the graph. */
2337 if (p_node_info && !tensor_symbol_info[d].assign_ref)
2338 for (j = 0; j < destination_size; j++)
2339 {
2340 // If the destination is connecting to current node, add (otherwise we will create tensor blocks that are used in other streams, which is unnecessary).
2341 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2342 if (cell.i32 && cell.i32[0] > 0)
2343 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2344 }
2345 }
2346 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2347 }
2348 for (i = 0; i < node->output_size; i++)
2349 {
2350 int d = node->outputs[i];
2351 if (d < 0)
2352 continue;
2353 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2354 d = tensor_symbol_info[d].alias_ref - 1;
2355 tensor_blocks[d].flags |= WRITE_ONLY;
2356 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2357 continue;
2358 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2358, __extension__ __PRETTY_FUNCTION__
); }))
;
2359 _ccv_nnc_tensor_block_add_exec(exec_dep, idx, tensor_blocks[d]);
2360 }
2361 } ccv_nnc_graph_visit_endfor} }
2362 // For any assign_ref, its life-time kept until the end and wrap over.
2363 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2364 // If this tensor is not unassigned (or alias) and it is assigned from somewhere else,
2365 // that "somewhere else" need to keep its life-time til the end.
2366 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&&
2367 p_node_info && tensor_symbol_info[i].assign_ref)
2368 {
2369 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2370 for (j = 0; j < destination_size; j++)
2371 {
2372 // This logic is to be more conservative about which destination we add to.
2373 // As of now, if we add everything, it is fine most likely. However, it may
2374 // cause issues in the future to do so naively. Thus, instead, we only add
2375 // the destination to it iff either the tensor is not used at all, or, the
2376 // destination is on the same stream as of the tensor block some way.
2377 int flag = !tensor_blocks[assign_ref].tail;
2378 for (k = 0; !flag && k < tensor_blocks[assign_ref].tail->rnum; k++)
2379 {
2380 const int idx = *(int*)ccv_array_get(tensor_blocks[assign_ref].tail, k)((void*)(((char*)((tensor_blocks[assign_ref].tail)->data))
+ (size_t)(tensor_blocks[assign_ref].tail)->rsize * (size_t
)(k)))
;
2381 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2382 flag = (cell.i32 && cell.i32[0] > 0);
2383 }
2384 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2385 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[assign_ref]);
2386 }
2387 }
2388 for (i = 0; i < output_size; i++)
2389 {
2390 assert(outputs[i].graph == symbolic_graph)((void) sizeof ((outputs[i].graph == symbolic_graph) ? 1 : 0)
, __extension__ ({ if (outputs[i].graph == symbolic_graph) ; else
__assert_fail ("outputs[i].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 2390, __extension__ __PRETTY_FUNCTION__); }))
;
2391 int d = outputs[i].d;
2392 if (d < 0)
2393 continue;
2394 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2395 d = tensor_symbol_info[d].alias_ref - 1;
2396 if (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED))
2397 continue;
2398 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 2398, __extension__ __PRETTY_FUNCTION__
); }))
;
2399 for (j = 0; j < destination_size; j++)
2400 {
2401 int flag = !tensor_blocks[d].tail;
2402 for (k = 0; !flag && k < tensor_blocks[d].tail->rnum; k++)
2403 {
2404 const int idx = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
2405 const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, destinations[j].d, idx);
2406 flag = (cell.i32 && cell.i32[0] > 0);
2407 }
2408 if (flag) // If there is no tail at all, add it. Otherwise, only add it if the destination is on the same stream with this tensor block somehow.
2409 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[d]);
2410 }
2411 }
2412 // Enforce tensor reuse by collapse tensors for in-place operations. We will fault if this cannot be done.
2413 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2414 int x, y;
2415 for (x = 0; x < node->input_size; x++)
2416 for (y = 0; y < node->output_size; y++)
2417 /* Some operations enforces some tensors to be the same for inputs / outputs. */
2418 if (ccv_nnc_cmd_enforce_inplace(node->cmd, x, node->input_size, y, node->output_size))
2419 {
2420 // If both unassigned, it is fine.
2421 if (node->inputs[x] < 0 && node->outputs[y] < 0)
2422 continue;
2423 int ref = node->inputs[x];
2424 assert(ref >= 0)((void) sizeof ((ref >= 0) ? 1 : 0), __extension__ ({ if (
ref >= 0) ; else __assert_fail ("ref >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 2424, __extension__ __PRETTY_FUNCTION__); }))
;
2425 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2426 ref = tensor_blocks[ref].ref - 1;
2427 const int node_output_y = node->outputs[y];
2428 assert(node_output_y >= 0)((void) sizeof ((node_output_y >= 0) ? 1 : 0), __extension__
({ if (node_output_y >= 0) ; else __assert_fail ("node_output_y >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2428, __extension__ __PRETTY_FUNCTION__
); }))
;
2429 // If both are not computable, it is fine, we don't need to enforce.
2430 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2431 !TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node_output_y])(!((tensor_blocks[node_output_y].flags & 0x3) == ALIAS) &&
!((tensor_blocks[node_output_y].flags & 0x3) == UNASSIGNED
))
)
2432 continue;
2433 // Otherwise, enforce and error out if failed.
2434 if (!_ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y))
2435 { assert(0 && "cannot enforce inplace for the two tensors")((void) sizeof ((0 && "cannot enforce inplace for the two tensors"
) ? 1 : 0), __extension__ ({ if (0 && "cannot enforce inplace for the two tensors"
) ; else __assert_fail ("0 && \"cannot enforce inplace for the two tensors\""
, "ccv_nnc_symbolic_graph_compile.c", 2435, __extension__ __PRETTY_FUNCTION__
); }))
; }
2436 }
2437 } ccv_nnc_graph_visit_endfor} }
2438 // Ignore tensors that are already bound, no matter whether they are used or not. Doing it here because
2439 // we need to make sure enforced tensors are properly assigned, so that we don't bind on a tensor
2440 // that is not enforced in-place (because the tensor enforced in-place will be different than the
2441 // binding one).
2442 for (i = 0; i < tensor_bind_size; i++)
2443 {
2444 const ccv_nnc_tensor_symbol_t resolved_symbol = ccv_nnc_tensor_symbol_resolve(symbolic_graph, tensor_binds[i].symbol);
2445 // If there is a tensor binded, then it is unassigned.
2446 if (resolved_symbol.d >= 0)
2447 {
2448 int d = resolved_symbol.d;
2449 // I cannot assert too much at this moment.
2450 if (TENSOR_EXPECT_ALIAS(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == ALIAS))
2451 d = tensor_symbol_info[d].alias_ref - 1; // Bind back to the original.
2452 // This check is for in-place ops. Only in-place op could have unassigned but ref.
2453 // It has nothing to do with alias.
2454 while (TENSOR_EXPECT_UNASSIGNED(tensor_blocks[d])((tensor_blocks[d].flags & 0x3) == UNASSIGNED) && tensor_blocks[d].ref)
2455 d = tensor_blocks[d].ref - 1;
2456 // Doesn't work if this is a loop carrying variable.
2457 assert(!tensor_symbol_info[d].assign_ref)((void) sizeof ((!tensor_symbol_info[d].assign_ref) ? 1 : 0),
__extension__ ({ if (!tensor_symbol_info[d].assign_ref) ; else
__assert_fail ("!tensor_symbol_info[d].assign_ref", "ccv_nnc_symbolic_graph_compile.c"
, 2457, __extension__ __PRETTY_FUNCTION__); }))
;
2458 tensor_blocks[d].flags = UNASSIGNED;
2459 tensor_blocks[d].ref = 0; // No need to have ref as well.
2460 }
2461 }
2462 // Maximum tensor reuse by collapse tensors allows in-place operations (and it matches the start, end tensor).
2463 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2464 int x, y;
2465 for (x = 0; x < node->input_size; x++)
2466 {
2467 /* If the input is not assigned, it can be referenced, find the referenced one */
2468 int ref = node->inputs[x];
2469 if (ref < 0)
2470 continue;
2471 const ccv_nnc_tensor_symbol_info_t x_symbol = tensor_symbol_info[ref];
2472 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
2473 ref = tensor_blocks[ref].ref - 1;
2474 assert(tensor_blocks[ref].ref == 0)((void) sizeof ((tensor_blocks[ref].ref == 0) ? 1 : 0), __extension__
({ if (tensor_blocks[ref].ref == 0) ; else __assert_fail ("tensor_blocks[ref].ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2474, __extension__ __PRETTY_FUNCTION__
); }))
;
2475 if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&&
2476 tensor_blocks[ref].tail->rnum == 1)
2477 {
2478 for (y = 0; y < node->output_size; y++)
2479 /* Only proceed if the input symbol is different from the output symbol, */
2480 /* and the input symbol meets the output symbol exactly at the same spot. */
2481 if (ccv_nnc_cmd_allow_inplace(node->cmd, x, node->input_size, y, node->output_size) &&
2482 node->outputs[y] >= 0 &&
2483 ref != node->outputs[y] &&
2484 TENSOR_EXPECT_COMPUTABLE(tensor_blocks[node->outputs[y]])(!((tensor_blocks[node->outputs[y]].flags & 0x3) == ALIAS
) && !((tensor_blocks[node->outputs[y]].flags &
0x3) == UNASSIGNED))
)
2485 {
2486 const int node_output_y = node->outputs[y];
2487 const ccv_nnc_tensor_symbol_info_t y_symbol = tensor_symbol_info[node_output_y];
2488 /* If dimension matches perfectly, then we can assign y_symbol to x.
2489 * If both of them are aliases, making sure their origin matches in size too. */
2490 if (memcmp(x_symbol.info.dim, y_symbol.info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
2491 {
2492 _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, ref, node_output_y);
2493 // This refers to an alias itself, now mark it and will be processed later.
2494 if (ref != node->inputs[x])
2495 tensor_blocks[node_output_y].alias_ref = node->inputs[x] + 1;
2496 }
2497 }
2498 }
2499 }
2500 } ccv_nnc_graph_visit_endfor} }
2501 // Specifically handle the bypass. This need to be done after the first pass.
2502 // I need to extend the bypass life-time to the same as the one I am going with.
2503 // It is important we visit these nodes and assign bypass_ref to its dependents in topological order.
2504 ccv_nnc_tensor_block_t empty_block = {};
2505 empty_block.head = ccv_array_new(sizeof(int), 0, 0);
2506 empty_block.tail = ccv_array_new(sizeof(int), 0, 0);
2507 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2508 if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
2509 {
2510 int can_bypass = 1;
2511 for (i = 0; can_bypass && i < node->output_size; i++)
2512 {
2513 int d = node->outputs[i];
2514 if (d < 0)
2515 continue;
2516 if (!tensor_blocks[d].bypass_ref)
2517 continue;
2518 while (tensor_blocks[d].ref)
2519 d = tensor_blocks[d].ref - 1;
2520 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2521 while (tensor_blocks[bypass_ref].ref)
2522 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2523 // If this doesn't participate in the while loop, we don't need to check the while loop constraint.
2524 if (!tensor_symbol_info[bypass_ref].assign_ref && !tensor_symbol_info[bypass_ref].r_assign_ref)
2525 continue;
2526 ccv_array_clear(empty_block.head);
2527 for (j = 0; tensor_blocks[bypass_ref].head && j < tensor_blocks[bypass_ref].head->rnum; j++)
2528 ccv_array_push(empty_block.head, ccv_array_get(tensor_blocks[bypass_ref].head, j)((void*)(((char*)((tensor_blocks[bypass_ref].head)->data))
+ (size_t)(tensor_blocks[bypass_ref].head)->rsize * (size_t
)(j)))
);
2529 ccv_array_clear(empty_block.tail);
2530 for (j = 0; tensor_blocks[bypass_ref].tail && j < tensor_blocks[bypass_ref].tail->rnum; j++)
2531 ccv_array_push(empty_block.tail, ccv_array_get(tensor_blocks[bypass_ref].tail, j)((void*)(((char*)((tensor_blocks[bypass_ref].tail)->data))
+ (size_t)(tensor_blocks[bypass_ref].tail)->rsize * (size_t
)(j)))
);
2532 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2533 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, empty_block);
2534 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2535 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, empty_block);
2536 // It can only be unfoldable due to while constraint. Check whether this satisfies the while loop constraint.
2537 assert(!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref))((void) sizeof ((!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ? 1 : 0), __extension__
({ if (!(tensor_symbol_info[bypass_ref].assign_ref &&
tensor_symbol_info[bypass_ref].r_assign_ref)) ; else __assert_fail
("!(tensor_symbol_info[bypass_ref].assign_ref && tensor_symbol_info[bypass_ref].r_assign_ref)"
, "ccv_nnc_symbolic_graph_compile.c", 2537, __extension__ __PRETTY_FUNCTION__
); }))
;
2538 int b_ref = (tensor_symbol_info[bypass_ref].assign_ref) ? tensor_symbol_info[bypass_ref].assign_ref - 1 : tensor_symbol_info[bypass_ref].r_assign_ref - 1;
2539 while (tensor_blocks[b_ref].ref)
2540 b_ref = tensor_blocks[b_ref].ref - 1;
2541 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, empty_block, tensor_blocks[b_ref]);
2542 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], empty_block);
2543 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere)
2544 // even after we extend the life-time of bypass_ref. Then we are in a good shape.
2545 can_bypass = can_bypass && (a_hop_b || b_hop_a);
2546 }
2547 if (can_bypass)
2548 {
2549 for (i = 0; i < node->output_size; i++)
2550 {
2551 int d = node->outputs[i];
2552 if (d < 0)
2553 continue;
2554 if (!tensor_blocks[d].bypass_ref)
2555 continue;
2556 while (tensor_blocks[d].ref)
2557 d = tensor_blocks[d].ref - 1;
2558 int bypass_ref = tensor_blocks[node->outputs[i]].bypass_ref - 1;
2559 while (tensor_blocks[bypass_ref].ref)
2560 bypass_ref = tensor_blocks[bypass_ref].ref - 1;
2561 // The bypass_ref can extend its life-time.
2562 for (j = 0; tensor_blocks[d].head && j < tensor_blocks[d].head->rnum; j++)
2563 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].head, j)((void*)(((char*)((tensor_blocks[d].head)->data)) + (size_t
)(tensor_blocks[d].head)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2564 for (j = 0; tensor_blocks[d].tail && j < tensor_blocks[d].tail->rnum; j++)
2565 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[d].tail, j)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(j)))
, tensor_blocks[bypass_ref]);
2566 }
2567 } else {
2568 for (i = 0; i < node->output_size; i++)
2569 tensor_blocks[node->outputs[i]].bypass_ref = 0;
2570 const int exec_idx = (dup_exec_from_ref) ? dup_exec_from_ref[idx] : idx;
2571 // Mark this exec as no bypass IO (thus, I need to insert explicit data transfer).
2572 exec_flags[exec_idx].flags |= CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO;
2573 }
2574 }
2575 } ccv_nnc_graph_visit_endfor} }
2576 ccv_array_free(empty_block.head);
2577 ccv_array_free(empty_block.tail);
2578 *r_exec_dep = exec_dep;
2579 *r_tensor_blocks = tensor_blocks;
2580}
2581
2582static ccv_nnc_cmd_t _ccv_nnc_subst_sub_graph_with_noop(const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd)
2583{
2584 if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
2585 {
2586 ccv_nnc_cmd_t retval = cmd;
2587 retval.cmd = CCV_NNC_NOOP;
2588 return retval;
2589 }
2590 return cmd;
2591}
2592
2593static ccv_nnc_tensor_symbol_t _ccv_nnc_dup_tensor_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int input)
2594{
2595 if (dup_tensor_block_ref[input * unroll_count] < 0) // No tensor ref, create one.
2596 {
2597 if (tensor_symbol_info[input].alias_ref)
2598 {
2599 const int alias_ref = tensor_symbol_info[input].alias_ref - 1;
2600 assert(tensor_symbol_info[alias_ref].alias_ref == 0)((void) sizeof ((tensor_symbol_info[alias_ref].alias_ref == 0
) ? 1 : 0), __extension__ ({ if (tensor_symbol_info[alias_ref
].alias_ref == 0) ; else __assert_fail ("tensor_symbol_info[alias_ref].alias_ref == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2600, __extension__ __PRETTY_FUNCTION__
); }))
;
2601 ccv_nnc_tensor_symbol_t tensor_symbol = {};
2602 if (dup_tensor_block_ref[alias_ref * unroll_count] < 0)
2603 {
2604 tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[alias_ref].info, 0);
2605 if (tensor_symbol_info[alias_ref].pair_ref)
2606 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2607 .d = tensor_symbol_info[alias_ref].pair_ref - 1,
2608 .graph = dup_graph->pair
2609 });
2610 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[alias_ref].flags);
2611 dup_tensor_block_ref[alias_ref * unroll_count] = tensor_symbol.d;
2612 } else {
2613 tensor_symbol.d = dup_tensor_block_ref[alias_ref * unroll_count];
2614 tensor_symbol.graph = dup_graph;
2615 }
2616 ccv_nnc_tensor_symbol_t alias_symbol = ccv_nnc_tensor_symbol_alias_new(dup_graph, tensor_symbol, tensor_symbol_info[input].ofs, tensor_symbol_info[input].stride, tensor_symbol_info[input].info, 0);
2617 if (tensor_symbol_info[input].pair_ref)
2618 ccv_nnc_tensor_symbol_pair_with(dup_graph, alias_symbol, (ccv_nnc_tensor_symbol_t){
2619 .d = tensor_symbol_info[input].pair_ref - 1,
2620 .graph = dup_graph->pair
2621 });
2622 ccv_nnc_tensor_symbol_set_flags(dup_graph, alias_symbol, tensor_symbol_info[input].flags);
2623 dup_tensor_block_ref[input * unroll_count] = alias_symbol.d;
2624 } else {
2625 ccv_nnc_tensor_symbol_t tensor_symbol = ccv_nnc_tensor_symbol_new(dup_graph, tensor_symbol_info[input].info, 0);
2626 if (tensor_symbol_info[input].pair_ref)
2627 ccv_nnc_tensor_symbol_pair_with(dup_graph, tensor_symbol, (ccv_nnc_tensor_symbol_t){
2628 .d = tensor_symbol_info[input].pair_ref - 1,
2629 .graph = dup_graph->pair
2630 });
2631 ccv_nnc_tensor_symbol_set_flags(dup_graph, tensor_symbol, tensor_symbol_info[input].flags);
2632 dup_tensor_block_ref[input * unroll_count] = tensor_symbol.d;
2633 }
2634 if (tensor_symbol_info[input].bypass_ref)
2635 {
2636 const int dup_bypass_ref = dup_tensor_block_ref[(tensor_symbol_info[input].bypass_ref - 1) * unroll_count];
2637 assert(dup_bypass_ref >= 0)((void) sizeof ((dup_bypass_ref >= 0) ? 1 : 0), __extension__
({ if (dup_bypass_ref >= 0) ; else __assert_fail ("dup_bypass_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 2637, __extension__ __PRETTY_FUNCTION__
); }))
;
2638 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(dup_graph->tensor_symbol_info, dup_tensor_block_ref[input * unroll_count])((void*)(((char*)((dup_graph->tensor_symbol_info)->data
)) + (size_t)(dup_graph->tensor_symbol_info)->rsize * (
size_t)(dup_tensor_block_ref[input * unroll_count])))
;
2639 symbol_info->bypass_ref = dup_bypass_ref + 1;
2640 }
2641 }
2642 return (ccv_nnc_tensor_symbol_t) {
2643 .d = dup_tensor_block_ref[input * unroll_count],
2644 .graph = dup_graph,
2645 };
2646}
2647
2648static ccv_nnc_graph_exec_symbol_t _ccv_nnc_dup_graph_exec_symbol(ccv_nnc_symbolic_graph_t* const dup_graph, const int unroll_count, int* const dup_exec_ref, int* const dup_tensor_block_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_info_t* const node, const int idx, ccv_nnc_tensor_symbol_t* const max_inputs, ccv_nnc_tensor_symbol_t* const max_outputs)
2649{
2650 int i;
2651 if (dup_exec_ref[idx * unroll_count] < 0)
2652 {
2653 // Input has to come before output, because output could has a bypass reference to the input.
2654 for (i = 0; i < node->input_size; i++)
2655 max_inputs[i] = (node->inputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->inputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->inputs[i], .graph = dup_graph };
2656 for (i = 0; i < node->output_size; i++)
2657 max_outputs[i] = (node->outputs[i] >= 0) ? _ccv_nnc_dup_tensor_symbol(dup_graph, unroll_count, dup_tensor_block_ref, tensor_symbol_info, node->outputs[i]) : (ccv_nnc_tensor_symbol_t){ .d = node->outputs[i], .graph = dup_graph };
2658 ccv_nnc_graph_exec_symbol_t exec_symbol = ccv_nnc_graph_exec_symbol_new(dup_graph, node->cmd, max_inputs, node->input_size, max_outputs, node->output_size, 0);
2659 dup_exec_ref[idx * unroll_count] = exec_symbol.d;
2660 }
2661 return (ccv_nnc_graph_exec_symbol_t) {
2662 .d = dup_exec_ref[idx * unroll_count],
2663 .graph = dup_graph,
2664 };
2665}
2666
2667static void _ccv_nnc_tensor_blocks_free(ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size)
2668{
2669 int i;
2670 for (i = 0; i < tensor_block_size; i++)
2671 {
2672 if (tensor_blocks[i].head)
2673 ccv_array_free(tensor_blocks[i].head);
2674 if (tensor_blocks[i].tail)
2675 ccv_array_free(tensor_blocks[i].tail);
2676 if (tensor_blocks[i].r_refs)
2677 ccv_array_free(tensor_blocks[i].r_refs);
2678 if (tensor_blocks[i].dup_p_refs)
2679 ccv_array_free(tensor_blocks[i].dup_p_refs);
2680 }
2681 ccfreefree(tensor_blocks);
2682}
2683
2684// Find tensors that cannot be solved by co-allocating to the same location.
2685static int _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks)
2686{
2687 int i, j, unroll_count = 0;
2688 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2689 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && tensor_symbol_info[i].assign_ref)
2690 {
2691 // This is is a parameter, thus, it has to be either an alias or used.
2692 assert(tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i]))((void) sizeof ((tensor_blocks[i].ref || ((tensor_blocks[i].flags
& 0x3) == 0)) ? 1 : 0), __extension__ ({ if (tensor_blocks
[i].ref || ((tensor_blocks[i].flags & 0x3) == 0)) ; else __assert_fail
("tensor_blocks[i].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[i])"
, "ccv_nnc_symbolic_graph_compile.c", 2692, __extension__ __PRETTY_FUNCTION__
); }))
;
2693 const int assign_ref = tensor_symbol_info[i].assign_ref - 1; // Starts at 1.
2694 // The parameter it assign to has to be either an alias or used.
2695 assert(tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref]))((void) sizeof ((tensor_blocks[assign_ref].ref || ((tensor_blocks
[assign_ref].flags & 0x3) == 0)) ? 1 : 0), __extension__ (
{ if (tensor_blocks[assign_ref].ref || ((tensor_blocks[assign_ref
].flags & 0x3) == 0)) ; else __assert_fail ("tensor_blocks[assign_ref].ref || TENSOR_EXPECT_ORDINARY(tensor_blocks[assign_ref])"
, "ccv_nnc_symbolic_graph_compile.c", 2695, __extension__ __PRETTY_FUNCTION__
); }))
;
2696 // If any of this two (assigner and assignee) is an alias, check to see if they are the same.
2697 // If it is the same, we are good, no need to extend.
2698 int a_ref = i;
2699 while (tensor_blocks[a_ref].ref)
2700 a_ref = tensor_blocks[a_ref].ref - 1;
2701 int b_ref = assign_ref;
2702 while (tensor_blocks[b_ref].ref)
2703 b_ref = tensor_blocks[b_ref].ref - 1;
2704 if (a_ref != b_ref)
2705 {
2706 // If any of the b's head is deterministically later than a's tail
2707 // or any of the b's tail is deterministically earlier than a's head, they don't interfere.
2708 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[a_ref]);
2709 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[a_ref], tensor_blocks[b_ref]);
2710 // It cannot be that both i can hop to j can j can hop to i.
2711 assert(!(a_hop_b > 0 && b_hop_a > 0))((void) sizeof ((!(a_hop_b > 0 && b_hop_a > 0))
? 1 : 0), __extension__ ({ if (!(a_hop_b > 0 && b_hop_a
> 0)) ; else __assert_fail ("!(a_hop_b > 0 && b_hop_a > 0)"
, "ccv_nnc_symbolic_graph_compile.c", 2711, __extension__ __PRETTY_FUNCTION__
); }))
;
2712 // Can it be folded
2713 // These two can be assigned to the same region of memory without issue (because their life-time doesn't interfere).
2714 if (a_hop_b || b_hop_a)
2715 {
2716 tensor_blocks[a_ref].companion_ref = b_ref + 1;
2717 tensor_blocks[b_ref].companion_ref = a_ref + 1;
2718 continue;
2719 }
2720 int c_ref = tensor_symbol_info[b_ref].assign_ref - 1;
2721 for (j = 0; c_ref >= 0; j++)
2722 {
2723 while (tensor_blocks[c_ref].ref)
2724 c_ref = tensor_blocks[c_ref].ref - 1;
2725 c_ref = tensor_symbol_info[c_ref].assign_ref - 1;
2726 }
2727 unroll_count = ccv_max(unroll_count, j + 1)({ typeof (unroll_count) _a = (unroll_count); typeof (j + 1) _b
= (j + 1); (_a > _b) ? _a : _b; })
;
2728 }
2729 }
2730 // Reset companion_ref if need to unroll.
2731 if (unroll_count)
2732 for (j = 0; j < symbolic_graph->tensor_symbol_info->rnum; j++)
2733 tensor_blocks[j].companion_ref = 0;
2734 return unroll_count;
2735}
2736
2737static void _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_visit_t* const visit, const int unroll_count, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_sparse_matrix_t* const exec_dep, const ccv_nnc_tensor_block_t* const tensor_blocks, ccv_nnc_symbolic_graph_t* const dup_graph, int* const r_dup_tensor_block_ref, int* const r_dup_exec_ref)
2738{
2739 int i, j, n;
2740 // The inout exec nodes, these are the nodes we are going to extend.
2741 uint8_t* inout = (uint8_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(uint8_t));
2742 int max_input_size = 0;
2743 int max_output_size = 0;
2744 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2745 {
2746 max_input_size = ccv_max(exec_symbol_info[i].input_size, max_input_size)({ typeof (exec_symbol_info[i].input_size) _a = (exec_symbol_info
[i].input_size); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
;
2747 max_output_size = ccv_max(exec_symbol_info[i].output_size, max_output_size)({ typeof (exec_symbol_info[i].output_size) _a = (exec_symbol_info
[i].output_size); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
;
2748 }
2749 ccv_nnc_tensor_symbol_t max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
2750 ccv_nnc_tensor_symbol_t max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
2751 // Doing graph expansion
2752 // It goes without saying, we must have more than one tensors / execs (otherwise I cannot use 0 as no exec ref).
2753 assert(dup_graph->exec_symbol_info->rnum > 0)((void) sizeof ((dup_graph->exec_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->exec_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->exec_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2753, __extension__ __PRETTY_FUNCTION__
); }))
;
2754 assert(dup_graph->tensor_symbol_info->rnum > 0)((void) sizeof ((dup_graph->tensor_symbol_info->rnum >
0) ? 1 : 0), __extension__ ({ if (dup_graph->tensor_symbol_info
->rnum > 0) ; else __assert_fail ("dup_graph->tensor_symbol_info->rnum > 0"
, "ccv_nnc_symbolic_graph_compile.c", 2754, __extension__ __PRETTY_FUNCTION__
); }))
;
2755#define INCOMING_NODE (1)
2756#define OUTGOING_NODE (2)
2757 // Unroll the graph n times.
2758 for (n = 0; n < unroll_count; n++)
2759 {
2760 int* const dup_exec_ref = r_dup_exec_ref + n;
2761 const int* const prev_dup_tensor_block_ref = n > 0 ? r_dup_tensor_block_ref + (n - 1) : 0;
2762 int* const dup_tensor_block_ref = r_dup_tensor_block_ref + n;
2763 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2764 dup_exec_ref[i * unroll_count] = -1;
2765 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2766 {
2767 // If there is a assign_ref, that means I don't need to dup the tensor.
2768 if (tensor_symbol_info[i].assign_ref)
2769 {
2770 const int assign_ref = tensor_symbol_info[i].assign_ref - 1;
2771 dup_tensor_block_ref[i * unroll_count] = prev_dup_tensor_block_ref ? prev_dup_tensor_block_ref[assign_ref * unroll_count] : assign_ref;
2772 } else if (TENSOR_EXPECT_COMPUTABLE(tensor_blocks[i])(!((tensor_blocks[i].flags & 0x3) == ALIAS) && !(
(tensor_blocks[i].flags & 0x3) == UNASSIGNED))
&& TENSOR_READ_WRITE(tensor_blocks[i])(tensor_blocks[i].flags & 0xc) == READ_ONLY)
2773 // If this is a read-only tensor block, no need to duplicate because the value never changes
2774 // (note we handled assign_ref first), therefore, no need to generate duplicate.
2775 dup_tensor_block_ref[i * unroll_count] = i;
2776 else
2777 dup_tensor_block_ref[i * unroll_count] = -1;
2778 }
2779 // Go through the original graph, make copies of the node if it is inout.
2780 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
2781 ccv_nnc_graph_exec_symbol_t exec_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, node, idx, max_inputs, max_outputs);
2782 inout[idx] |= INCOMING_NODE; /* Mark this node as incoming. */
2783 if (!node->outgoings)
2784 continue;
2785 for (i = 0; i < node->outgoings->rnum; i++)
2786 {
2787 const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
2788 inout[outgoing_idx] |= OUTGOING_NODE; /* Mark this node as outgoing. */
2789 ccv_nnc_graph_exec_symbol_t outgoing_symbol = _ccv_nnc_dup_graph_exec_symbol(dup_graph, unroll_count, dup_exec_ref, dup_tensor_block_ref, tensor_symbol_info, exec_symbol_info + outgoing_idx, outgoing_idx, max_inputs, max_outputs);
2790 ccv_nnc_graph_exec_symbol_concat(dup_graph, exec_symbol, outgoing_symbol);
2791 }
2792 } ccv_nnc_graph_visit_endfor} }
2793 // Check the visitor are all marked as either incoming or outgoing.
2794 const ccv_nnc_graph_exec_symbol_t* const dup_destinations = ccv_nnc_symbolic_graph_destinations(dup_graph);
2795 const int dup_destination_size = ccv_nnc_symbolic_graph_destination_size(dup_graph);
2796 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2797 {
2798 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2799 continue;
2800 assert((inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE))((void) sizeof (((inout[i] & INCOMING_NODE) || (inout[i] &
OUTGOING_NODE)) ? 1 : 0), __extension__ ({ if ((inout[i] &
INCOMING_NODE) || (inout[i] & OUTGOING_NODE)) ; else __assert_fail
("(inout[i] & INCOMING_NODE) || (inout[i] & OUTGOING_NODE)"
, "ccv_nnc_symbolic_graph_compile.c", 2800, __extension__ __PRETTY_FUNCTION__
); }))
;
2801 // If this is pure incoming nodes, then I need to concat this one with all original destination node
2802 if (inout[i] == INCOMING_NODE)
2803 for (j = 0; j < dup_destination_size; j++)
2804 {
2805 ccv_nnc_graph_exec_symbol_concat(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2806 .d = dup_destinations[j].d,
2807 .graph = dup_graph,
2808 }, (ccv_nnc_graph_exec_symbol_t) {
2809 .d = dup_exec_ref[i * unroll_count],
2810 .graph = dup_graph,
2811 });
2812 }
2813 }
2814 if (dup_graph->destinations)
2815 ccv_array_clear(dup_graph->destinations);
2816 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2817 {
2818 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2819 continue;
2820 const int d = dup_exec_ref[i * unroll_count];
2821 ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, d)((void*)(((char*)((dup_graph->exec_symbol_info)->data))
+ (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(d)))
;
2822 // If this has no outgoing node, add to the destination.
2823 if (!exec_symbol_info->outgoings || exec_symbol_info->outgoings->rnum == 0)
2824 ccv_nnc_symbolic_graph_add_destination(dup_graph, (ccv_nnc_graph_exec_symbol_t) {
2825 .graph = dup_graph,
2826 .d = d,
2827 });
2828 }
2829 }
2830#undef INCOMING_NODE
2831#undef OUTGOING_NODE
2832 ccfreefree(inout);
2833}
2834
2835static void _ccv_nnc_fixup_assign_ref_after_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const int unroll_count, const ccv_nnc_tensor_block_t* const tensor_blocks, const int* const dup_tensor_block_ref, ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info)
2836{
2837 int i;
2838 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++) // symbolic graph is the old graph and tensor blocks is the old tensor blocks.
2839 // Now can assign them (The dup) as companion.
2840 // Get to the last one, which we will wrap over.
2841 if (dup_tensor_symbol_info[i].assign_ref)
2842 {
2843 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = 0;
2844 dup_tensor_symbol_info[i].assign_ref = dup_tensor_block_ref[(dup_tensor_symbol_info[i].assign_ref - 1) * unroll_count + unroll_count - 1] + 1;
2845 assert(dup_tensor_symbol_info[i].assign_ref)((void) sizeof ((dup_tensor_symbol_info[i].assign_ref) ? 1 : 0
), __extension__ ({ if (dup_tensor_symbol_info[i].assign_ref)
; else __assert_fail ("dup_tensor_symbol_info[i].assign_ref"
, "ccv_nnc_symbolic_graph_compile.c", 2845, __extension__ __PRETTY_FUNCTION__
); }))
;
2846 dup_tensor_symbol_info[dup_tensor_symbol_info[i].assign_ref - 1].r_assign_ref = i + 1;
2847 }
2848}
2849
2850// If the tensor blocks are the outputs of this graph, its life-time should be extended to the end of this graph.
2851// However, it is not that simple if the graph is unrolled. For unrolled graph, it needs to reach the end of
2852// the "original" graph and all its duplicated ends (for their duplicated tensor blocks).
2853static void _ccv_nnc_fixup_tensor_blocks_for_outputs(ccv_sparse_matrix_t* const exec_dep, ccv_nnc_tensor_block_t* const tensor_blocks, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const int unroll_count, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const int p_idx, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int* const dup_exec_ref, const int* const dup_tensor_block_ref)
2854{
2855 int i, j, k;
2856 for (i = 0; i < p_node_info->output_size; i++)
2857 {
2858 const int d = p_node_info->outputs[i];
2859 const int s_ref = *(int*)ccv_array_get(p_tensor_symbol_info[d].s_ref, p_idx)((void*)(((char*)((p_tensor_symbol_info[d].s_ref)->data)) +
(size_t)(p_tensor_symbol_info[d].s_ref)->rsize * (size_t)
(p_idx)))
- 1;
2860 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[s_ref])(!((tensor_blocks[s_ref].flags & 0x3) == ALIAS) &&
!((tensor_blocks[s_ref].flags & 0x3) == UNASSIGNED))
)
2861 continue;
2862 for (k = 0; k < destination_size; k++)
2863 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[k].d, tensor_blocks[s_ref]);
2864 // Add the duplicated destinations to the tensor_block_ref.
2865 for (j = 0; j < unroll_count; j++)
2866 for (k = 0; k < destination_size; k++)
2867 {
2868 const int dup_exec_idx = dup_exec_ref[destinations[k].d * unroll_count + j];
2869 const int dup_tensor_block_idx = dup_tensor_block_ref[s_ref * unroll_count + j];
2870 if (dup_exec_idx >= 0 && dup_tensor_block_idx >= 0)
2871 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_idx, tensor_blocks[dup_tensor_block_idx]);
2872 }
2873 }
2874}
2875
2876static void _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_graph_exec_flag_t* const exec_flags, ccv_sparse_matrix_t** r_exec_dep, ccv_nnc_tensor_block_t** r_tensor_blocks, int* r_tensor_block_size, ccv_nnc_symbolic_graph_t** r_dup_graph, int* r_unroll_count, int** r_dup_exec_ref, int** r_dup_tensor_block_ref)
2877{
2878 int i, j;
2879 ccv_sparse_matrix_t* exec_dep = *r_exec_dep;
2880 ccv_nnc_tensor_block_t* tensor_blocks = *r_tensor_blocks;
2881 // blocks that cannot be simply solved with either in-place operation tensor block folding or using the same memory region.
2882 // Unfortunately, I cannot do this analysis to the block folding done for sub-graphs, because we do sub-graph placement later.
2883 // No need to change anything, we are good.
2884 const int unroll_count = _ccv_nnc_exec_dep_and_tensor_blocks_unroll_count(symbolic_graph, tensor_symbol_info, exec_dep, tensor_blocks);
2885 if (!unroll_count)
2886 return;
2887 // Have conditions that cannot be satisfied with simple solution (allocate to the same memory region).
2888 // Doing graph expansion, first duplicate the old graph, but replace all sub graphs with noop.
2889 ccv_nnc_symbolic_graph_t* dup_graph = ccv_nnc_symbolic_graph_dup(symbolic_graph, _ccv_nnc_subst_sub_graph_with_noop);
2890 int* dup_exec_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->exec_symbol_info->rnum * unroll_count);
2891 int* dup_tensor_block_ref = (int*)ccmallocmalloc(sizeof(int) * symbolic_graph->tensor_symbol_info->rnum * unroll_count);
2892 _ccv_nnc_exec_dep_and_tensor_blocks_unroll_n(symbolic_graph, visit, unroll_count, exec_symbol_info, tensor_symbol_info, exec_dep, tensor_blocks, dup_graph, dup_tensor_block_ref, dup_exec_ref);
2893 ccv_nnc_tensor_symbol_info_t* const dup_tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * dup_graph->tensor_symbol_info->rnum);
2894 ccv_nnc_graph_exec_symbol_info_t* const dup_exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * dup_graph->exec_symbol_info->rnum);
2895 ccv_nnc_graph_visit_t* dup_visit = ccv_nnc_graph_visit_new(dup_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(dup_graph->exec_symbol_info, 0), dup_graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0), dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0), dup_graph->destinations->rnum, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((dup_graph->exec_symbol_info->rnum) - 1)); _visit_->
size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c
; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (dup_graph->exec_symbol_info->
rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t*
)((void*)(((char*)((dup_graph->exec_symbol_info)->data)
) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_ = (
(dup_graph->exec_symbol_info->rnum) + _incoming_edges_ >
1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_
= (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * (
dup_graph->exec_symbol_info->rnum) + sizeof(int32_t) * (
(dup_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->
rnum) + sizeof(int32_t) * ((dup_graph->exec_symbol_info->
rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (dup_graph->exec_symbol_info->rnum
)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (dup_graph
->exec_symbol_info->rnum)) + (dup_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (dup_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (dup_graph
->sources->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = {
(dup_graph->sources->rnum), 0, }; int _p_ = 0, _q_ = 1
; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for
(_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t
_idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue
; _incomings_[_idx_].r = 2; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); ++_incomings_
[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r =
1; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_++) {
((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char
*)((dup_graph->sources)->data)) + (size_t)(dup_graph->
sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ?
1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)
((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->sources)->data)) + (size_t)(dup_graph->sources
)->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (dup_graph
->sources->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1
; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 3) continue; _incomings_[_idx_].r = 4; if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum; _j_++) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_))); if (_incomings_
[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_
[d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges -
1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_
[d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof ((
_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum
)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (dup_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_
[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((
dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d; } _exist_size_
[0] = (dup_graph->destinations->rnum); _exist_size_[1] =
0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) {
const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_
].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (dup_graph->destinations->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->destinations)->data)) + (size_t)
(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].
graph == dup_graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].d = 1; }
for (_i_ = 0; _i_ < (dup_graph->sources->rnum); _i_
++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void*
)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph
== dup_graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph->sources)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->sources)->data)) + (size_t)(dup_graph
->sources)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_
= 1; _exist_size_[0] = (dup_graph->sources->rnum); _exist_size_
[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_
[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const
int32_t _idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_
->size].index = ((_idx_)); _visit_->node[_visit_->size
].term = ((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings->rnum == 1) { const int d = *(int
*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t*)((void
*)(((char*)((dup_graph->exec_symbol_info)->data)) + (size_t
)(dup_graph->exec_symbol_info)->rsize * (size_t)(0))))[
_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((dup_graph->exec_symbol_info)->data
)) + (size_t)(dup_graph->exec_symbol_info)->rsize * (size_t
)(0))))[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_
[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r ==
6 && _d_ < (dup_graph->destinations->rnum))
{ _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_
< ((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((
(ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((dup_graph
->exec_symbol_info)->data)) + (size_t)(dup_graph->exec_symbol_info
)->rsize * (size_t)(0))))[_idx_].outgoings)->data)) + (
size_t)(((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)
((dup_graph->exec_symbol_info)->data)) + (size_t)(dup_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
)->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_
[d].c == 0 && _incomings_[d].r == 6 && _d_ <
(dup_graph->destinations->rnum)) { ((void) sizeof ((_exist_size_
[_q_] < (dup_graph->exec_symbol_info->rnum)) ? 1 : 0
), __extension__ ({ if (_exist_size_[_q_] < (dup_graph->
exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (dup_graph->destinations->
rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].graph == dup_graph) ? 1 : 0), __extension__ ({ if ((
(ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].graph == dup_graph) ; else __assert_fail
("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].graph == dup_graph"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void
*)(((char*)((dup_graph->destinations)->data)) + (size_t
)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_]
.d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_
[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->
destinations)->data)) + (size_t)(dup_graph->destinations
)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__
({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)((
(char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph
->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0)
; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((dup_graph->destinations)->data)) + (size_t)(dup_graph->destinations)->rsize * (size_t)(0))))[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t*
)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
)[_i_].d].c > 0) continue; _visit_->node[_visit_->size
].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)(
(dup_graph->destinations)->data)) + (size_t)(dup_graph->
destinations)->rsize * (size_t)(0))))[_i_].d)); _visit_->
node[_visit_->size].term = ((_incomings_[((ccv_nnc_graph_exec_symbol_t
*)((void*)(((char*)((dup_graph->destinations)->data)) +
(size_t)(dup_graph->destinations)->rsize * (size_t)(0)
)))[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_) free(
_incomings_); } while (0);; ((void) sizeof ((_visit_->size
<= (dup_graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__
({ if (_visit_->size <= (dup_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_visit_->size <= (dup_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 2895, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
2896 ccv_nnc_symbolic_graph_symbol_infer(dup_graph, dup_visit, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_tensor_symbol_info, dup_exec_symbol_info);
2897 _ccv_nnc_fixup_assign_ref_after_unroll(symbolic_graph, unroll_count, tensor_blocks, dup_tensor_block_ref, dup_tensor_symbol_info);
2898 // Free out the old exec_dep
2899 ccv_matrix_free(exec_dep);
2900 // and the tensor blocks, prepare for the new.
2901 _ccv_nnc_tensor_blocks_free(tensor_blocks, symbolic_graph->tensor_symbol_info->rnum);
2902 // A reverse map to find where the original tensor comes from.
2903 int* dup_tensor_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->tensor_symbol_info->rnum);
2904 for (i = 0; i < dup_graph->tensor_symbol_info->rnum; i++)
2905 dup_tensor_from_ref[i] = -1;
2906 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2907 for (j = 0; j < unroll_count; j++)
2908 if (dup_tensor_block_ref[i * unroll_count + j] >= 0)
2909 dup_tensor_from_ref[dup_tensor_block_ref[i * unroll_count + j]] = i;
2910 int* dup_exec_from_ref = (int*)ccmallocmalloc(sizeof(int) * dup_graph->exec_symbol_info->rnum);
2911 for (i = 0; i < dup_graph->exec_symbol_info->rnum; i++)
2912 dup_exec_from_ref[i] = -1;
2913 for (i = 0; i < symbolic_graph->exec_symbol_info->rnum; i++)
2914 {
2915 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_symbol_info[i].flags)((exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD))
2916 continue;
2917 dup_exec_from_ref[i] = i; // Reference back.
2918 for (j = 0; j < unroll_count; j++)
2919 if (dup_exec_ref[i * unroll_count + j] >= 0)
2920 dup_exec_from_ref[dup_exec_ref[i * unroll_count + j]] = i;
2921 }
2922 // Reset all attr.
2923 memset(exec_flags, 0, sizeof(ccv_nnc_graph_exec_flag_t) * symbolic_graph->exec_symbol_info->rnum);
2924 _ccv_nnc_exec_dep_and_tensor_blocks_prep(dup_graph, p_node_info, dup_visit, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->sources, 0)((void*)(((char*)((dup_graph->sources)->data)) + (size_t
)(dup_graph->sources)->rsize * (size_t)(0)))
, dup_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_graph->destinations, 0)((void*)(((char*)((dup_graph->destinations)->data)) + (
size_t)(dup_graph->destinations)->rsize * (size_t)(0)))
, dup_graph->destinations->rnum, p_tensor_symbol_info, p_tensor_symbol_info_size, dup_exec_symbol_info, dup_tensor_symbol_info, unroll_count, dup_tensor_block_ref, dup_tensor_from_ref, dup_exec_from_ref, exec_flags, &exec_dep, &tensor_blocks);
2925 ccv_nnc_graph_visit_free(dup_visit);
2926 ccfreefree(dup_exec_symbol_info);
2927 ccfreefree(dup_exec_from_ref);
2928 ccfreefree(dup_tensor_from_ref);
2929 // Assign out dup_p_ref, which will be used to extend the anonymous block's life-time.
2930 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2931 // Loop over all possible duplications to assign dup_p_ref properly.
2932 for (j = 0; j < unroll_count; j++)
2933 {
2934 const int dup_idx = dup_tensor_block_ref[j + i * unroll_count];
2935 if (dup_idx >= 0 && (tensor_blocks[i].p_refs[0] || tensor_blocks[i].p_refs[1]))
2936 {
2937 const int p_ref_0 = tensor_blocks[i].p_refs[0] - 1;
2938 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
2939 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
2940 {
2941 if (!tensor_blocks[dup_idx].dup_p_refs)
2942 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2943 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_0);
2944 }
2945 if (p_ref_0_is_in_or_out == 1 || tensor_blocks[i].p_refs[1] == 0)
2946 continue;
2947 const int p_ref_1 = tensor_blocks[i].p_refs[1] - 1;
2948 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, p_node_info);
2949 if (p_ref_1_is_in_or_out == 1)
2950 {
2951 if (!tensor_blocks[dup_idx].dup_p_refs)
2952 tensor_blocks[dup_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
2953 ccv_array_add_unique_int(tensor_blocks[dup_idx].dup_p_refs, p_ref_1);
2954 }
2955 }
2956 }
2957 // companion_ref
2958 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
2959 // Now can assign them (The dup) as companion.
2960 if (!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[i])((tensor_blocks[i].flags & 0x3) == UNASSIGNED) && dup_tensor_symbol_info[i].assign_ref)
2961 {
2962 // Get to the last one, which we will wrap over.
2963 const int assign_ref = dup_tensor_symbol_info[i].assign_ref - 1;
2964 if (assign_ref >= 0)
2965 {
2966 int b_ref = assign_ref;
2967 while (tensor_blocks[b_ref].ref)
2968 b_ref = tensor_blocks[b_ref].ref - 1;
2969 int a_hop_b = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[i], tensor_blocks[b_ref]);
2970 int b_hop_a = _ccv_nnc_tensor_block_head_after_tail(exec_dep, tensor_blocks[b_ref], tensor_blocks[i]);
2971 // It cannot be that both i can hop to j and j can hop to i.
2972 // And one can hop to the other now, after duplication.
2973 assert(a_hop_b > 0 || b_hop_a > 0)((void) sizeof ((a_hop_b > 0 || b_hop_a > 0) ? 1 : 0), __extension__
({ if (a_hop_b > 0 || b_hop_a > 0) ; else __assert_fail
("a_hop_b > 0 || b_hop_a > 0", "ccv_nnc_symbolic_graph_compile.c"
, 2973, __extension__ __PRETTY_FUNCTION__); }))
;
2974 tensor_blocks[i].companion_ref = b_ref + 1;
2975 tensor_blocks[b_ref].companion_ref = i + 1;
2976 }
2977 }
2978 ccfreefree(dup_tensor_symbol_info);
2979 // Extend the dup tensor block ref, prepare for future extensions.
2980 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * dup_graph->tensor_symbol_info->rnum * unroll_count);
2981 for (i = symbolic_graph->tensor_symbol_info->rnum * unroll_count; i < dup_graph->tensor_symbol_info->rnum * unroll_count; i++)
2982 dup_tensor_block_ref[i] = -1;
2983 // Assign out changed properties.
2984 *r_exec_dep = exec_dep;
2985 *r_tensor_blocks = tensor_blocks;
2986 *r_tensor_block_size = dup_graph->tensor_symbol_info->rnum;
2987 *r_dup_graph = dup_graph;
2988 *r_unroll_count = unroll_count;
2989 *r_dup_exec_ref = dup_exec_ref;
2990 *r_dup_tensor_block_ref = dup_tensor_block_ref;
2991}
2992
// Select, from the first anonymous_block_free_list_cap entries of anonymous_block_free_list, the index of a
// tensor block of the requested type to reuse for a request of the given size and dup_p_refs.
// Returns tensor_block_size (used as the "nothing found" sentinel, since every list entry is asserted to be
// smaller than it) when the list is null, the cap is 0, or no entry has a matching type. Otherwise returns
// an index read from the free list.
// exec_dep is only forwarded to _ccv_nnc_tensor_block_a_after_b_inclusively, which is defined outside this block.
2993static int _ccv_nnc_anonymous_tensor_block_from_free_list(const ccv_nnc_tensor_block_t* const tensor_blocks, const int tensor_block_size, const ccv_array_t* const anonymous_block_free_list, const int anonymous_block_free_list_cap, const int type, const uint64_t size, const ccv_sparse_matrix_t* const exec_dep, const ccv_array_t* const dup_p_refs)
2994{
2995 if (!anonymous_block_free_list || !anonymous_block_free_list_cap)
2996 return tensor_block_size;
2997 int i;
 // True when the request carries no dup_p_refs (null array or empty array).
2998 const int no_dup_p_refs = (!dup_p_refs || !dup_p_refs->rnum);
 // Best candidate so far; tensor_block_size means "no candidate yet".
2999 int found_idx = tensor_block_size;
3000 for (i = 0; i < anonymous_block_free_list_cap; i++)
3001 {
3002 const int idx = *(int*)ccv_array_get(anonymous_block_free_list, i)((void*)(((char*)((anonymous_block_free_list)->data)) + (size_t
)(anonymous_block_free_list)->rsize * (size_t)(i)))
;
3003 assert(idx < tensor_block_size)((void) sizeof ((idx < tensor_block_size) ? 1 : 0), __extension__
({ if (idx < tensor_block_size) ; else __assert_fail ("idx < tensor_block_size"
, "ccv_nnc_symbolic_graph_compile.c", 3003, __extension__ __PRETTY_FUNCTION__
); }))
;
3004 // If the type doesn't match, ignore.
3005 if (tensor_blocks[idx].type != type)
3006 continue;
3007 // Heuristic about how to select the best tensor block to move forward.
3008 // If the size is larger, and no dup_p_refs, found, I cannot do better than this, just return directly.
3009 if (tensor_blocks[idx].size >= size)
3010 {
3011 if (no_dup_p_refs)
3012 return idx;
3013 // Otherwise, only if the current tensor block's dup_p_refs is after (or at) the dup_p_refs,
3014 // then we cannot do better than this, if that is the case, just return.
3015 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum &&
3016 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs))
3017 return idx;
3018 }
 // Both differences are assigned inside the condition below, and only when found_idx already holds a
 // candidate. When there is no candidate yet the || short-circuits, the branch is taken and we continue,
 // so the unassigned values are never read.
 // NOTE(review): as written, the candidate replaces found_idx when found_idx's distance to the requested
 // size is strictly SMALLER than the candidate's, and found_idx is kept when its distance is larger.
 // Confirm this direction matches the "closer to the size we request" intent stated in the comments.
3019 int64_t found_idx_size_diff;
3020 int64_t idx_size_diff;
3021 if (found_idx == tensor_block_size || // If no found_idx yet, set the current one to be the found one, and continue.
3022 // Now, compare whether this one or the found_idx one is better.
3023 // At this point, there is no point of comparing the dup_p_refs, we only care about which one
3024 // is closer to the size we request. Only on a tie, dup_p_refs or not is important again.
3025 (found_idx_size_diff = llabs((int64_t)tensor_blocks[found_idx].size - (int64_t)size)) < (idx_size_diff = llabs((int64_t)tensor_blocks[idx].size - (int64_t)size)))
3026 {
3027 found_idx = idx;
3028 continue;
3029 }
3030 // No need to update if found_idx is better than idx.
3031 if (found_idx_size_diff > idx_size_diff)
3032 continue;
3033 // We bias towards the bigger one in case of similar.
3034 if (found_idx_size_diff == idx_size_diff && tensor_blocks[idx].size > tensor_blocks[found_idx].size)
3035 {
3036 found_idx = idx;
3037 continue;
3038 }
 // NOTE(review): reaching this point means the two differences are equal and idx is not the larger block.
 // If the two sizes sat on opposite sides of the requested size with idx being the smaller one, this
 // assert would fire; confirm that case cannot occur for the callers' inputs.
3039 assert(tensor_blocks[idx].size == tensor_blocks[found_idx].size)((void) sizeof ((tensor_blocks[idx].size == tensor_blocks[found_idx
].size) ? 1 : 0), __extension__ ({ if (tensor_blocks[idx].size
== tensor_blocks[found_idx].size) ; else __assert_fail ("tensor_blocks[idx].size == tensor_blocks[found_idx].size"
, "ccv_nnc_symbolic_graph_compile.c", 3039, __extension__ __PRETTY_FUNCTION__
); }))
;
3040 // On a tie, check which one has tighter life-cycle.
3041 if (tensor_blocks[idx].size >= size) // If this block size covers the size we request, we prefer longer life-cycle ones.
3042 {
3043 // Check whether the current tensor blocks life-cycle is longer than the previous one.
3044 if (tensor_blocks[idx].dup_p_refs && tensor_blocks[idx].dup_p_refs->rnum > 0 &&
3045 (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum ||
3046 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3047 found_idx = idx;
3048 continue;
3049 }
3050 // Now both our size is smaller than requested size, in this case, we need to increase the tensor block size.
3051 // We prefer to choose the one that has life-cycle closer to the expected ones.
3052 if (no_dup_p_refs)
3053 {
3054 // Whoever is shorter wins.
 // The candidate wins when found_idx has dup_p_refs and the candidate either has none or found_idx's
 // dup_p_refs are reported as after (inclusively) the candidate's.
3055 if (tensor_blocks[found_idx].dup_p_refs && tensor_blocks[found_idx].dup_p_refs->rnum > 0 &&
3056 (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum ||
3057 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs)))
3058 found_idx = idx;
3059 continue;
3060 }
 // From here on the request has dup_p_refs. A candidate without dup_p_refs never replaces found_idx,
 // while a found_idx without dup_p_refs is always replaced by a candidate that has them.
3061 if (!tensor_blocks[idx].dup_p_refs || !tensor_blocks[idx].dup_p_refs->rnum)
3062 continue;
3063 if (!tensor_blocks[found_idx].dup_p_refs || !tensor_blocks[found_idx].dup_p_refs->rnum)
3064 {
3065 found_idx = idx;
3066 continue;
3067 }
3068 // If both covers the request dup_p_refs, we prefer the shorter one, otherwise we prefer the longer one.
3069 const int idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, dup_p_refs);
3070 const int found_idx_after_request = _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, dup_p_refs);
3071 if (idx_after_request && found_idx_after_request)
3072 {
3073 if (_ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[found_idx].dup_p_refs, tensor_blocks[idx].dup_p_refs))
3074 found_idx = idx;
3075 continue;
3076 } else {
3077 // We entered this branch must be either idx_after_request is false or found_idx_after_request is false or both.
3078 // If found_idx_after_request is not false, we are currently doing fine, no need to proceed.
3079 // Otherwise, if idx_after_request is true, it is preferred. If both are false, then prefer the longer one.
3080 if (!found_idx_after_request && (idx_after_request ||
3081 _ccv_nnc_tensor_block_a_after_b_inclusively(exec_dep, tensor_blocks[idx].dup_p_refs, tensor_blocks[found_idx].dup_p_refs)))
3082 found_idx = idx;
3083 continue;
3084 }
3085 }
 // Either the best candidate seen, or tensor_block_size if no entry had a matching type.
3086 return found_idx;
3087}
3088
// For a graph whose parent node is flagged CCV_NNC_GRAPH_EXEC_P_WHILE, create one CCV_NNC_NOOP exec symbol
// per breakpoint that takes the parent's tensor inputs (the non-negative entries of p_while.inputs) as its
// inputs, and wire it into symbolic_graph alongside the breakpoint:
//  - the noop is concatenated to every outgoing node of the breakpoint,
//  - every pre-existing, non-dead node that has the breakpoint as an outgoing is concatenated to the noop,
//  - the noop is added as a source / destination when the breakpoint itself is one.
// Returns the array of created noop symbols (index k corresponds to breakpoints[k]), or 0 when p_node_info
// is null, is not a while node, or has no tensor inputs. This function mutates symbolic_graph.
// NOTE(review): the returned array comes from ccv_array_new and is not freed here; presumably the caller
// owns it — confirm against the call site.
3089static ccv_array_t* _ccv_nnc_dup_breakpoints_with_p_node_inputs(ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_info_t* const p_node_info)
3090{
3091 if (!(p_node_info && (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)))
3092 return 0;
3093 int i, j, k;
 // First pass: count the inputs with a non-negative index so the array below can be sized.
3094 int input_size = 0;
3095 for (i = 0; i < p_node_info->p_while.input_size; i++)
3096 if (p_node_info->p_while.inputs[i] >= 0)
3097 ++input_size;
3098 // If doesn't have tensor inputs (thus, only special inputs), just return.
3099 if (!input_size)
3100 return 0;
 // Second pass: collect those inputs as tensor symbols bound to symbolic_graph.
3101 ccv_nnc_tensor_symbol_t inputs[input_size];
3102 input_size = 0;
3103 for (i = 0; i < p_node_info->p_while.input_size; i++)
3104 if (p_node_info->p_while.inputs[i] >= 0)
3105 inputs[input_size++] = (ccv_nnc_tensor_symbol_t){
3106 .d = p_node_info->p_while.inputs[i],
3107 .graph = symbolic_graph,
3108 };
3109 assert(symbolic_graph->breakpoint_size > 0)((void) sizeof ((symbolic_graph->breakpoint_size > 0) ?
1 : 0), __extension__ ({ if (symbolic_graph->breakpoint_size
> 0) ; else __assert_fail ("symbolic_graph->breakpoint_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3109, __extension__ __PRETTY_FUNCTION__
); }))
;
3110 ccv_array_t* dup_breakpoints = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), symbolic_graph->breakpoint_size, 0);
 // Snapshot the exec symbol count before any noop is created; the incoming-edge loop further down uses
 // this bound (presumably so that the newly created noops are not visited — confirm that
 // ccv_nnc_graph_exec_symbol_new appends to exec_symbol_info).
3111 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3112 for (i = 0; i < symbolic_graph->breakpoint_size; i++)
3113 {
3114 // Make a noop copy of the breakpoint, but with some tensor inputs.
3115 ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(symbolic_graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), inputs, input_size, 0, 0, 0);
3116 ccv_array_push(dup_breakpoints, &noop);
3117 // Connect this noop to the outgoing nodes of breakpoints.
3118 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, symbolic_graph->breakpoints[i].d)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(symbolic_graph->breakpoints[i].d)))
;
3119 if (symbol_info->outgoings)
3120 for (j = 0; j < symbol_info->outgoings->rnum; j++)
3121 {
3122 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3123 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, noop, (ccv_nnc_graph_exec_symbol_t){
3124 .d = d,
3125 .graph = symbolic_graph,
3126 });
3127 }
3128 }
 // For every pre-existing, non-dead node, connect it to the matching noop for each of its outgoings
 // that is a breakpoint.
3129 for (i = 0; i < exec_symbol_info_size; i++)
3130 {
3131 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(i)))
;
3132 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(symbol_info->flags)((symbol_info->flags) & CCV_NNC_GRAPH_EXEC_DEAD))
3133 continue;
3134 if (symbol_info->outgoings)
3135 {
 // The count is read once up front; the concat call below takes node i as its source (presumably
 // appending to this same outgoings array — confirm), so only the original edges are scanned.
3136 const int outgoing_size = symbol_info->outgoings->rnum;
3137 for (j = 0; j < outgoing_size; j++)
3138 {
3139 const int d = *(int*)ccv_array_get(symbol_info->outgoings, j)((void*)(((char*)((symbol_info->outgoings)->data)) + (size_t
)(symbol_info->outgoings)->rsize * (size_t)(j)))
;
3140 for (k = 0; k < symbolic_graph->breakpoint_size; k++)
3141 if (d == symbolic_graph->breakpoints[k].d)
3142 {
3143 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, k)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(k)))
;
3144 ccv_nnc_graph_exec_symbol_concat(symbolic_graph, (ccv_nnc_graph_exec_symbol_t){
3145 .d = i,
3146 .graph = symbolic_graph,
3147 }, noop);
3148 // Found, connected, exit.
3149 break;
3150 }
3151 }
3152 }
3153 }
3154 // Add the dup_breakpoints to source if necessary.
3155 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 3155, __extension__ __PRETTY_FUNCTION__
); }))
;
 // The source count is read before the loop, so the loop bound is unaffected by the add_source calls below.
3156 const int source_size = symbolic_graph->sources->rnum;
3157 for (i = 0; i < source_size; i++)
3158 {
3159 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, i)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(i)))
)->d;
3160 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3161 if (d == symbolic_graph->breakpoints[j].d)
3162 {
3163 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)))
;
3164 ccv_nnc_symbolic_graph_add_source(symbolic_graph, noop);
3165 // Found, made, exit.
3166 break;
3167 }
3168 }
3169 // Add the dup_breakpoints to destination if necessary.
3170 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 3170, __extension__ __PRETTY_FUNCTION__); }))
;
 // Same pattern as for sources: the destination count is read before the loop that may add destinations.
3171 const int destination_size = symbolic_graph->destinations->rnum;
3172 for (i = 0; i < destination_size; i++)
3173 {
3174 const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, i)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(i)))
)->d;
3175 for (j = 0; j < symbolic_graph->breakpoint_size; j++)
3176 if (d == symbolic_graph->breakpoints[j].d)
3177 {
3178 ccv_nnc_graph_exec_symbol_t noop = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(dup_breakpoints, j)((void*)(((char*)((dup_breakpoints)->data)) + (size_t)(dup_breakpoints
)->rsize * (size_t)(j)))
;
3179 ccv_nnc_symbolic_graph_add_destination(symbolic_graph, noop);
3180 // Found, made, exit.
3181 break;
3182 }
3183 }
3184 return dup_breakpoints;
3185}
3186
3187// Plan out how we allocate tensor (should I do optimizations on graph here or not at all?).
3188static ccv_nnc_symbolic_graph_prep_t* _ccv_nnc_symbolic_graph_prep_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_info_t* const p_tensor_symbol_info, const int p_tensor_symbol_info_size, const ccv_nnc_graph_exec_symbol_info_t* const p_exec_symbol_info, const int p_exec_symbol_info_size)
3189{
3190 assert(source_size > 0)((void) sizeof ((source_size > 0) ? 1 : 0), __extension__ (
{ if (source_size > 0) ; else __assert_fail ("source_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3190, __extension__ __PRETTY_FUNCTION__
); }))
;
3191 assert(destination_size > 0)((void) sizeof ((destination_size > 0) ? 1 : 0), __extension__
({ if (destination_size > 0) ; else __assert_fail ("destination_size > 0"
, "ccv_nnc_symbolic_graph_compile.c", 3191, __extension__ __PRETTY_FUNCTION__
); }))
;
3192 // First, fill all the "auto" holes.
3193 // This is the symbol table that with "auto" info filled up.
3194 ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * symbolic_graph->tensor_symbol_info->rnum);
3195 ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * symbolic_graph->exec_symbol_info->rnum);
3196 ccv_nnc_graph_exec_flag_t* exec_flags = (ccv_nnc_graph_exec_flag_t*)cccalloccalloc(symbolic_graph->exec_symbol_info->rnum, sizeof(ccv_nnc_graph_exec_flag_t));
3197 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3197, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3197, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 3197, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 3197, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
3198 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, exec_symbol_info);
3199 int i, j, k, p, q;
3200 const ccv_nnc_graph_exec_symbol_info_t* const p_node_info = p_exec_symbol_info ? p_exec_symbol_info + (symbolic_graph->exec_idx - 1) : 0;
3201 ccv_sparse_matrix_t* exec_dep;
3202 ccv_nnc_tensor_block_t* tensor_blocks;
3203 _ccv_nnc_exec_dep_and_tensor_blocks_prep(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, 0, 0, 0, 0, exec_flags, &exec_dep, &tensor_blocks);
3204 int tensor_block_size = symbolic_graph->tensor_symbol_info->rnum;
3205 // Now, everything is prepared, tensor life is analyzed, inplace operations are collapsed, all tensor symbols and hints
3206 // are automatically filled in, and all the sub-graphs are processed.
3207 // There is a last step though, for a while loop, it is parameterized:
3208 // while (x > 5) {
3209 // y = x + 1;
3210 // } (y => x) // This means after this loop is done, y's value will be copied over to x.
3211 // we will do our best to avoid to do the actual data copy, what we do here is to check whether y can be x's alias.
3212 // If y can be x's alias, this is good, no other changes required. In above case, y can be x's alias because
3213 // it is an in-place operation.
3214 // But if y cannot be x's alias, for example, this while loop looks like this:
3215 // while (x > 5) {
3216 // y = x + a
3217 // b = x + y
3218 // } (y => x, b => a) // This means after this loop is done, y's value copied to x and b's value copied to a.
3219 // For this example, y cannot be x's alias because x is used later to compute b (and that computation
3220 // has dependency on y as well).
3221 // For this case, we need to modify the computation graph. Previously, the graph looks like this:
3222 // y = x + a -> b = x + y
3223 // This graph will be extended to look like this:
3224 // y0 = x0 + a0 -> b0 = x0 + y0 -> y1 = y0 + b0 -> b1 = y0 + y1, or:
3225 // while (x0 > 5) {
3226 // y0 = x0 + a0
3227 // b0 = x0 + y0
3228 // if (y0 > 5) break
3229 // y1 = y0 + b0
3230 // b1 = y0 + y1
3231 // } (y1 => x0, b1 => a0)
3232 // After this expansion, y1 now can be the alias of x0, as well as b1 can be alias of a0 (they don't interfere
3233 // with each other now).
3234 // With this algorithm, we don't need to insert any data copy logic; the only thing needed is to switch pointers
3235 // which is covered by the tensor_multiview_t construct (thus, y (y0, y1), x (y1, y0), b (b0, b1), a (b1, b0))
3236 ccv_nnc_symbolic_graph_t* dup_graph = 0;
3237 int* dup_exec_ref = 0;
3238 int* dup_tensor_block_ref = 0;
3239 int unroll_count = 0;
3240 // In true recursive fashion, I need to call all the sub graphs and do the pre compilation for them one by one.
3241 ccv_nnc_symbolic_graph_prep_t* prep = (ccv_nnc_symbolic_graph_prep_t*)ccmallocmalloc(sizeof(ccv_nnc_symbolic_graph_prep_t));
3242 prep->graph = ccv_nnc_graph_new(); // Just allocate the graph right now.
3243 prep->flags = 0;
3244 // Cannot handle dup a node that is a graph as well.
3245 if (p_exec_symbol_info)
3246 {
3247 prep->flags = p_node_info->flags;
3248 if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3249 {
3250 _ccv_nnc_redo_exec_dep_and_tensor_blocks_when_unroll(symbolic_graph, p_node_info, visit, tensor_binds, tensor_bind_size, sources, source_size, destinations, destination_size, p_tensor_symbol_info, p_tensor_symbol_info_size, exec_symbol_info, tensor_symbol_info, exec_flags, &exec_dep, &tensor_blocks, &tensor_block_size, &dup_graph, &unroll_count, &dup_exec_ref, &dup_tensor_block_ref);
3251 _ccv_nnc_fixup_tensor_blocks_for_outputs(exec_dep, tensor_blocks, p_node_info, unroll_count, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
, symbolic_graph->destinations->rnum, symbolic_graph->p_idx - 1, p_tensor_symbol_info, p_tensor_symbol_info_size, tensor_symbol_info, dup_exec_ref, dup_tensor_block_ref);
3252 } else if (p_node_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3253 // TODO: We want to try our best to fit as much of its corresponding inputs / outputs into companion_ref group.
3254 }
3255 }
3256 ccv_nnc_symbolic_graph_prep_t** sub_preps = symbolic_graph->sub_graphs && symbolic_graph->sub_graphs->rnum ? (ccv_nnc_symbolic_graph_prep_t**)cccalloccalloc(symbolic_graph->sub_graphs->rnum, sizeof(ccv_nnc_symbolic_graph_prep_t*)) : 0;
3257 ccv_array_t* anonymous_block_free_list = 0;
3258 const int tensor_fold_size = (tensor_block_size + 31) >> 5;
3259 // Record whether this tensor is folded in this round.
3260 uint32_t* const tensor_fold = (uint32_t*)ccmallocmalloc(sizeof(uint32_t) * tensor_fold_size);
3261 ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const
int idx __attribute__((unused)) = (visit)->node[_i_].index
; const int _node_unused_ __attribute__((unused)) = (visit)->
node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__
((unused)) = (exec_symbol_info) + idx;
{
3262 for (p = 0; p < node->graph_ref_size; p++)
3263 {
3264 assert(symbolic_graph->sub_graphs)((void) sizeof ((symbolic_graph->sub_graphs) ? 1 : 0), __extension__
({ if (symbolic_graph->sub_graphs) ; else __assert_fail (
"symbolic_graph->sub_graphs", "ccv_nnc_symbolic_graph_compile.c"
, 3264, __extension__ __PRETTY_FUNCTION__); }))
;
3265 ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[p] - 1)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (
node)->_inline_graph_ref)[p] - 1)))
;
3266 ccv_array_t* const dup_breakpoints = _ccv_nnc_dup_breakpoints_with_p_node_inputs(sub_graph, node);
3267 ccv_nnc_symbolic_graph_prep_t* const sub_prep = _ccv_nnc_symbolic_graph_prep_new(sub_graph, tensor_binds, tensor_bind_size, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->sources, 0)((void*)(((char*)((sub_graph->sources)->data)) + (size_t
)(sub_graph->sources)->rsize * (size_t)(0)))
, sub_graph->sources->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_graph->destinations, 0)((void*)(((char*)((sub_graph->destinations)->data)) + (
size_t)(sub_graph->destinations)->rsize * (size_t)(0)))
, sub_graph->destinations->rnum, tensor_symbol_info, symbolic_graph->tensor_symbol_info->rnum, exec_symbol_info, symbolic_graph->exec_symbol_info->rnum);
3268 sub_prep->dup_breakpoints = dup_breakpoints;
3269 sub_prep->p = prep;
3270 sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[p] - 1] = sub_prep;
3271 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3272 const ccv_nnc_tensor_block_t* const s_tensor_blocks = sub_prep->tensor_blocks;
3273 for (i = 0; i < s_alloc_prep->block_size; i++)
3274 {
3275 const int block_ref = s_alloc_prep->blocks[i].block_ref;
3276 const int buffer_ref = s_alloc_prep->blocks[i].buffer_ref;
3277 if (block_ref < sub_prep->tensor_symbol_info_size)
3278 {
3279 // If this block has a bypass, and its bypass has a different p_refs, then it doesn't matter.
3280 // I cannot assign p_refs to its parent buffer, and that buffer has to be anonymous.
3281 if (s_tensor_blocks[block_ref].bypass_ref)
3282 {
3283 int bypass_ref = s_tensor_blocks[block_ref].bypass_ref - 1;
3284 while (s_tensor_blocks[bypass_ref].ref)
3285 bypass_ref = s_tensor_blocks[bypass_ref].ref - 1;
3286 if (s_tensor_blocks[block_ref].p_refs[0] != s_tensor_blocks[bypass_ref].p_refs[0] ||
3287 s_tensor_blocks[block_ref].p_refs[1] != s_tensor_blocks[bypass_ref].p_refs[1])
3288 continue;
3289 }
3290 if (s_tensor_blocks[block_ref].p_refs[0])
3291 {
3292 /* If it is already properly assigned, next. */
3293 if (s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[0] &&
3294 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[0])
3295 {
3296 if (!s_alloc_prep->buffers[buffer_ref].p_refs[0])
3297 s_alloc_prep->buffers[buffer_ref].p_refs[0] = s_tensor_blocks[block_ref].p_refs[0];
3298 else {
3299 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3299, __extension__ __PRETTY_FUNCTION__
); }))
;
3300 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[0];
3301 }
3302 }
3303 /* When entering this branch, s_alloc_prep->buffers[buffer_ref].p_refs[0] cannot be 0. */
3304 if (s_tensor_blocks[block_ref].p_refs[1] &&
3305 s_alloc_prep->buffers[buffer_ref].p_refs[0] != s_tensor_blocks[block_ref].p_refs[1] &&
3306 s_alloc_prep->buffers[buffer_ref].p_refs[1] != s_tensor_blocks[block_ref].p_refs[1])
3307 {
3308 assert(s_alloc_prep->buffers[buffer_ref].p_refs[0])((void) sizeof ((s_alloc_prep->buffers[buffer_ref].p_refs[
0]) ? 1 : 0), __extension__ ({ if (s_alloc_prep->buffers[buffer_ref
].p_refs[0]) ; else __assert_fail ("s_alloc_prep->buffers[buffer_ref].p_refs[0]"
, "ccv_nnc_symbolic_graph_compile.c", 3308, __extension__ __PRETTY_FUNCTION__
); }))
;
3309 assert(!s_alloc_prep->buffers[buffer_ref].p_refs[1])((void) sizeof ((!s_alloc_prep->buffers[buffer_ref].p_refs
[1]) ? 1 : 0), __extension__ ({ if (!s_alloc_prep->buffers
[buffer_ref].p_refs[1]) ; else __assert_fail ("!s_alloc_prep->buffers[buffer_ref].p_refs[1]"
, "ccv_nnc_symbolic_graph_compile.c", 3309, __extension__ __PRETTY_FUNCTION__
); }))
;
3310 s_alloc_prep->buffers[buffer_ref].p_refs[1] = s_tensor_blocks[block_ref].p_refs[1];
3311 }
3312 }
3313 } else if (s_tensor_blocks[block_ref].dup_p_refs) {
3314 /* In this case, only relevant bit is dup_p_ref. dup_p_ref extends the life-time of anonymous block
3315 * which by default only has life-cycle shared with this sub-graph node. The reason to extend is that
3316 * these anonymous blocks that has dup_p_ref may contain data that will be used as output (thus, dup_p_ref
3317 * always points to an output tensor of this sub-graph node) therefore, the memory region must extend
3318 * its life-time to the end of the output tensor. */
3319 if (!s_alloc_prep->buffers[buffer_ref].dup_p_refs)
3320 s_alloc_prep->buffers[buffer_ref].dup_p_refs = ccv_array_new(sizeof(int), s_tensor_blocks[block_ref].dup_p_refs->rnum, 0);
3321 for (j = 0; j < s_tensor_blocks[block_ref].dup_p_refs->rnum; j++)
3322 ccv_array_add_unique_int(s_alloc_prep->buffers[buffer_ref].dup_p_refs, *(int*)ccv_array_get(s_tensor_blocks[block_ref].dup_p_refs, j)((void*)(((char*)((s_tensor_blocks[block_ref].dup_p_refs)->
data)) + (size_t)(s_tensor_blocks[block_ref].dup_p_refs)->
rsize * (size_t)(j)))
);
3323 }
3324 }
3325 }
3326 const int init_tensor_block_size = tensor_block_size;
3327 int rw_anonymous_buffer_size_cap = 0;
3328 int ro_anonymous_buffer_size_cap = 0;
3329 if (anonymous_block_free_list)
3330 ccv_array_clear(anonymous_block_free_list);
3331 memset(tensor_fold, 0, sizeof(uint32_t) * tensor_fold_size);
3332 for (p = 0; p < node->graph_ref_size; p++)
3333 {
3334 ccv_nnc_symbolic_graph_prep_t* const sub_prep = sub_preps[CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[p] - 1];
3335 const ccv_nnc_tensor_alloc_prep_t* const s_alloc_prep = sub_prep->alloc_prep;
3336 int rw_anonymous_buffer_size = 0;
3337 int ro_anonymous_buffer_size = 0;
3338 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3339 if (s_alloc_prep->buffers[i].p_refs[0])
3340 {
3341 /* Reduce 2 p_refs, if it is, to 1 p_ref (by doing block folding). */
3342 int p_ref_0 = s_alloc_prep->buffers[i].p_refs[0] - 1;
3343 /* Need to go through refs. Since we reuse the tensor block for this input, it now has to have at least this much space allocated. */
3344 int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, node);
3345 assert(p_ref_0_is_in_or_out != 0)((void) sizeof ((p_ref_0_is_in_or_out != 0) ? 1 : 0), __extension__
({ if (p_ref_0_is_in_or_out != 0) ; else __assert_fail ("p_ref_0_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3345, __extension__ __PRETTY_FUNCTION__
); }))
;
3346 int unref_p_ref_0 = p_ref_0;
3347 while (tensor_blocks[unref_p_ref_0].ref)
3348 unref_p_ref_0 = tensor_blocks[unref_p_ref_0].ref - 1;
3349 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3350 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3350, __extension__ __PRETTY_FUNCTION__); }))
;
3351 if (s_alloc_prep->buffers[i].p_refs[1])
3352 {
3353 int p_ref_1 = s_alloc_prep->buffers[i].p_refs[1] - 1;
3354 const int p_ref_1_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_1, node);
3355 assert(p_ref_1_is_in_or_out != 0)((void) sizeof ((p_ref_1_is_in_or_out != 0) ? 1 : 0), __extension__
({ if (p_ref_1_is_in_or_out != 0) ; else __assert_fail ("p_ref_1_is_in_or_out != 0"
, "ccv_nnc_symbolic_graph_compile.c", 3355, __extension__ __PRETTY_FUNCTION__
); }))
;
3356 int unref_p_ref_1 = p_ref_1;
3357 while (tensor_blocks[unref_p_ref_1].ref)
3358 unref_p_ref_1 = tensor_blocks[unref_p_ref_1].ref - 1;
3359 /* See above comment for the similar p_ref_0 check. */
3360 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1]))((void) sizeof ((!((tensor_blocks[unref_p_ref_1].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_1].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_1])", "ccv_nnc_symbolic_graph_compile.c"
, 3360, __extension__ __PRETTY_FUNCTION__); }))
;
3361 assert(p_ref_0_is_in_or_out != p_ref_1_is_in_or_out)((void) sizeof ((p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ? 1 : 0), __extension__ ({ if (p_ref_0_is_in_or_out != p_ref_1_is_in_or_out
) ; else __assert_fail ("p_ref_0_is_in_or_out != p_ref_1_is_in_or_out"
, "ccv_nnc_symbolic_graph_compile.c", 3361, __extension__ __PRETTY_FUNCTION__
); }))
;
3362 int p_ref_t;
3363 if (p_ref_0_is_in_or_out < p_ref_1_is_in_or_out) /* if p_ref_0 is input and p_ref_1 is output, switch. */
3364 {
3365 CCV_SWAP(p_ref_0, p_ref_1, p_ref_t)((p_ref_t) = (p_ref_0), (p_ref_0) = (p_ref_1), (p_ref_1) = (p_ref_t
))
;
3366 CCV_SWAP(unref_p_ref_0, unref_p_ref_1, p_ref_t)((p_ref_t) = (unref_p_ref_0), (unref_p_ref_0) = (unref_p_ref_1
), (unref_p_ref_1) = (p_ref_t))
;
3367 }
3368 p_ref_0_is_in_or_out = 1; /* Now p_ref_0 surely is the output tensor. */
3369 /* If the dimensions match, can fold. TODO: should the dimensions match perfectly here? */
3370 if (memcmp(tensor_symbol_info[unref_p_ref_1].info.dim, tensor_symbol_info[unref_p_ref_0].info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC(12)) == 0)
3371 {
3372 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, unref_p_ref_1, unref_p_ref_0);
3373 if (folded)
3374 {
3375 p_ref_0 = p_ref_1;
3376 unref_p_ref_0 = unref_p_ref_1; // p_ref_0 now folded into p_ref_1, therefore, pointing to p_ref_1 now.
3377 tensor_fold[unref_p_ref_0 >> 5] |= (1u << (unref_p_ref_0 & 0x1f));
3378 for (j = 0; j < unroll_count; j++) /* Fold its duplicates as well. */
3379 {
3380 const int folded = _ccv_nnc_tensor_blocks_try_fold(tensor_blocks, dup_tensor_block_ref[unref_p_ref_1 * unroll_count + j], dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]);
3381 assert(folded && "the subsequent duplicates can be folded too.")((void) sizeof ((folded && "the subsequent duplicates can be folded too."
) ? 1 : 0), __extension__ ({ if (folded && "the subsequent duplicates can be folded too."
) ; else __assert_fail ("folded && \"the subsequent duplicates can be folded too.\""
, "ccv_nnc_symbolic_graph_compile.c", 3381, __extension__ __PRETTY_FUNCTION__
); }))
;
3382 }
3383 }
3384 }
3385 }
3386 /* Only proceed if it is folded here (thus, the input / output tensor can be connected, reuse is not a problem
3387 * Or if the p_ref_0 is the output, it is the first started from this node (thus, I have full control over
3388 * its life-cycle). Or if the p_ref_0 is the input, it is ended in this node (thus, I can take over its
3389 * life-cycle freely within this sub-graph (otherwise, if it is used anywhere, I cannot change the content
3390 * within its memory region)). Unless this buffer is used as read-only, and we don't have any output
3391 * associated with it, then we are good. */
3392 if ((tensor_fold[unref_p_ref_0 >> 5] & (1u << (unref_p_ref_0 & 0x1f))) ||
3393 (p_ref_0_is_in_or_out == 1 && _ccv_nnc_tensor_block_check_head(tensor_blocks + unref_p_ref_0, idx)) ||
3394 (p_ref_0_is_in_or_out == -1 && _ccv_nnc_tensor_block_check_tail(tensor_blocks + unref_p_ref_0, idx)) ||
3395 TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3396 {
3397 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3398 { assert(s_alloc_prep->buffers[i].p_refs[1] == 0)((void) sizeof ((s_alloc_prep->buffers[i].p_refs[1] == 0) ?
1 : 0), __extension__ ({ if (s_alloc_prep->buffers[i].p_refs
[1] == 0) ; else __assert_fail ("s_alloc_prep->buffers[i].p_refs[1] == 0"
, "ccv_nnc_symbolic_graph_compile.c", 3398, __extension__ __PRETTY_FUNCTION__
); }))
; }
3399 /* p_ref_0 is either the only one, or the output tensor, we always prefer the output tensor (there
3400 * is a long argument why that is the case, the digest is, it is much easier to control your output
3401 * than your input). */
3402 s_alloc_prep->buffers[i].p_refs[0] = p_ref_0 + 1;
3403 s_alloc_prep->buffers[i].p_refs[1] = 0;
3404 /* This parent tensor block cannot be unassigned because it is either input / output of this sub-graph node. */
3405 assert(!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0]))((void) sizeof ((!((tensor_blocks[unref_p_ref_0].flags & 0x3
) == UNASSIGNED)) ? 1 : 0), __extension__ ({ if (!((tensor_blocks
[unref_p_ref_0].flags & 0x3) == UNASSIGNED)) ; else __assert_fail
("!TENSOR_EXPECT_UNASSIGNED(tensor_blocks[unref_p_ref_0])", "ccv_nnc_symbolic_graph_compile.c"
, 3405, __extension__ __PRETTY_FUNCTION__); }))
;
3406 tensor_blocks[unref_p_ref_0].size = ccv_max(s_alloc_prep->buffers[i].size, tensor_blocks[unref_p_ref_0].size)({ typeof (s_alloc_prep->buffers[i].size) _a = (s_alloc_prep
->buffers[i].size); typeof (tensor_blocks[unref_p_ref_0].size
) _b = (tensor_blocks[unref_p_ref_0].size); (_a > _b) ? _a
: _b; })
;
3407 for (j = 0; j < unroll_count; j++) /* Change the size of its duplicates as well. */
3408 tensor_blocks[dup_tensor_block_ref[p_ref_0 * unroll_count + j]].size =
3409 tensor_blocks[dup_tensor_block_ref[unref_p_ref_0 * unroll_count + j]].size =
3410 tensor_blocks[unref_p_ref_0].size;
3411 } else {
3412 s_alloc_prep->buffers[i].p_refs[0] = s_alloc_prep->buffers[i].p_refs[1] = 0;
3413 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3414 ++ro_anonymous_buffer_size;
3415 else
3416 rw_anonymous_buffer_size += unroll_count + 1;
3417 }
3418 } else {
3419 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY)
3420 ++ro_anonymous_buffer_size;
3421 else
3422 rw_anonymous_buffer_size += unroll_count + 1;
3423 }
3424 if (ro_anonymous_buffer_size || rw_anonymous_buffer_size)
3425 {
3426 const int anonymous_block_free_list_cap = anonymous_block_free_list ? anonymous_block_free_list->rnum : 0;
3427 // All read-write buffer (potentially) can be reused between each case..of branch.
3428 rw_anonymous_buffer_size_cap += rw_anonymous_buffer_size;
3429 // Read-only buffer cannot be reused between each case..of branch.
3430 ro_anonymous_buffer_size_cap += ro_anonymous_buffer_size;
3431 /* Anonymous block, allocate additional tensor blocks for this. */
3432 /* This is either because this is an internal tensor (don't have p_ref) */
3433 /* or it is an anonymous block itself within the sub graphs of this while graph. */
3434 tensor_blocks = (ccv_nnc_tensor_block_t*)ccreallocrealloc(tensor_blocks, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3435 memset(tensor_blocks + tensor_block_size, 0, sizeof(ccv_nnc_tensor_block_t) * (init_tensor_block_size + (unroll_count + 1) * rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap - tensor_block_size));
3436 if (dup_tensor_block_ref)
3437 dup_tensor_block_ref = (int*)ccreallocrealloc(dup_tensor_block_ref, sizeof(int) * unroll_count * (init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap));
3438 for (i = 0; i < s_alloc_prep->buffer_size; i++)
3439 if (!s_alloc_prep->buffers[i].p_refs[0])
3440 {
3441 if (TENSOR_READ_WRITE(s_alloc_prep->buffers[i])(s_alloc_prep->buffers[i].flags & 0xc) == READ_ONLY) /* If it is read-only, add all sources (destinations) to it. */
3442 {
3443 assert(tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap)((void) sizeof ((tensor_block_size < init_tensor_block_size
+ rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ? 1 : 0), __extension__ ({ if (tensor_block_size < init_tensor_block_size
+ rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap
) ; else __assert_fail ("tensor_block_size < init_tensor_block_size + rw_anonymous_buffer_size_cap + ro_anonymous_buffer_size_cap"
, "ccv_nnc_symbolic_graph_compile.c", 3443, __extension__ __PRETTY_FUNCTION__
); }))
;
3444 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_size])(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0x10) | ANONYMOUS))
;
3445 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_size], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_size].flags = ((tensor_blocks[tensor_block_size
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3446 tensor_blocks[tensor_block_size].type = s_alloc_prep->buffers[i].type;
3447 tensor_blocks[tensor_block_size].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3448 tensor_blocks[tensor_block_size].size = s_alloc_prep->buffers[i].size;
3449 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_size + 1;
3450 tensor_blocks[tensor_block_size].head = ccv_array_new(sizeof(int), 1, 0);
3451 ccv_array_push(tensor_blocks[tensor_block_size].head, &idx);
3452 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3453 if (dup_p_refs && dup_p_refs->rnum > 0)
3454 {
3455 for (j = 0; j < dup_p_refs->rnum; j++)
3456 {
3457 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3458 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3458, __extension__ __PRETTY_FUNCTION__
); }))
;
3459 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3459, __extension__ __PRETTY_FUNCTION__
); }))
;
3460 assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3460, __extension__ __PRETTY_FUNCTION__); }))
;
3461 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3462 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3463 if (tensor_symbol_info[dup_p_ref].p_ref)
3464 {
3465 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3466 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3466, __extension__ __PRETTY_FUNCTION__); }))
;
3467 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3468 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3469 {
3470 if (!tensor_blocks[tensor_block_size].dup_p_refs)
3471 tensor_blocks[tensor_block_size].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3472 ccv_array_add_unique_int(tensor_blocks[tensor_block_size].dup_p_refs, p_ref_0);
3473 }
3474 }
3475 if (!tensor_blocks[tensor_block_size].tail)
3476 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3477 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3478 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
(size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k)))
, tensor_blocks[tensor_block_size]);
3479 }
3480 } else {
3481 tensor_blocks[tensor_block_size].tail = ccv_array_new(sizeof(int), 1, 0);
3482 ccv_array_push(tensor_blocks[tensor_block_size].tail, &idx);
3483 }
3484 for (j = 0; j < source_size; j++)
3485 _ccv_nnc_tensor_block_add_exec(exec_dep, sources[j].d, tensor_blocks[tensor_block_size]);
3486 /* If this is a read-only (based on SSA, if first encountered as read), and this is
3487 * sub-graph. Mark it to the end of the graph. */
3488 if (p_exec_symbol_info)
3489 for (j = 0; j < destination_size; j++)
3490 _ccv_nnc_tensor_block_add_exec(exec_dep, destinations[j].d, tensor_blocks[tensor_block_size]);
3491 /* If it is read-only, it is self-reflecting. */
3492 for (k = 0; k < unroll_count; k++)
3493 {
3494 for (j = 0; j < destination_size; j++)
3495 if (dup_exec_ref[destinations[j].d * unroll_count + k] >= 0)
3496 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[destinations[j].d * unroll_count + k], tensor_blocks[tensor_block_size]);
3497 /* No need to extend life-time, because this is a sub-graph and we already extended read-only to the end of destination. */
3498 assert(symbolic_graph->p)((void) sizeof ((symbolic_graph->p) ? 1 : 0), __extension__
({ if (symbolic_graph->p) ; else __assert_fail ("symbolic_graph->p"
, "ccv_nnc_symbolic_graph_compile.c", 3498, __extension__ __PRETTY_FUNCTION__
); }))
;
3499 dup_tensor_block_ref[tensor_block_size * unroll_count + k] = tensor_block_size;
3500 }
3501 ++tensor_block_size;
3502 } else {
3503 ccv_array_t* const dup_p_refs = s_alloc_prep->buffers[i].dup_p_refs;
3504 const int tensor_block_idx = _ccv_nnc_anonymous_tensor_block_from_free_list(tensor_blocks, tensor_block_size, anonymous_block_free_list, anonymous_block_free_list_cap, s_alloc_prep->buffers[i].type, s_alloc_prep->buffers[i].size, exec_dep, dup_p_refs);
3505 const int new_anonymous_tensor_block = (tensor_block_idx == tensor_block_size);
3506 // Find suitable tensor block from the free list.
3507 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS))
;
3508 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3509 s_alloc_prep->buffers[i].p_refs[0] = tensor_block_idx + 1;
3510 if (new_anonymous_tensor_block)
3511 {
3512 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3513 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3514 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3515 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3516 ccv_array_push(tensor_blocks[tensor_block_idx].head, &idx);
3517 } else {
3518 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3519 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
_a : _b; })
;
3520 }
3521 if (dup_p_refs && dup_p_refs->rnum > 0)
3522 {
3523 for (j = 0; j < dup_p_refs->rnum; j++)
3524 {
3525 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3526 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3526, __extension__ __PRETTY_FUNCTION__
); }))
;
3527 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3527, __extension__ __PRETTY_FUNCTION__
); }))
;
3528 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3529 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3530 if (tensor_symbol_info[dup_p_ref].p_ref)
3531 {
3532 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3533 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3533, __extension__ __PRETTY_FUNCTION__); }))
;
3534 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3535 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3536 {
3537 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3538 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3539 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3540 }
3541 }
3542 assert(tensor_blocks[dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_p_ref].tail) ? 1 : 0), __extension__
({ if (tensor_blocks[dup_p_ref].tail) ; else __assert_fail (
"tensor_blocks[dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3542, __extension__ __PRETTY_FUNCTION__); }))
;
3543 if (!tensor_blocks[tensor_block_idx].tail)
3544 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_p_ref].tail->rnum, 0);
3545 for (k = 0; k < tensor_blocks[dup_p_ref].tail->rnum; k++)
3546 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_p_ref].tail, k)((void*)(((char*)((tensor_blocks[dup_p_ref].tail)->data)) +
(size_t)(tensor_blocks[dup_p_ref].tail)->rsize * (size_t)
(k)))
, tensor_blocks[tensor_block_idx]);
3547 // We have to add it to the wrap-around companion_ref as well.
3548 // TODO: Although we know this wastes space (any space in between the current one and its companion_ref will still
3549 // be occupied and is unlikely to be reused), we cannot really do much about it because the companion_ref's
3550 // definition is too free-form, and if we enforce a stronger guarantee on this (such as it must wrap around), this
3551 // guarantee may be broken down the line.
3552 if (tensor_blocks[dup_p_ref].companion_ref)
3553 {
3554 const int companion_ref = tensor_blocks[dup_p_ref].companion_ref - 1;
3555 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3556 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3557 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3558 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3559 }
3560 }
3561 } else if (new_anonymous_tensor_block) {
3562 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3563 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &idx);
3564 }
3565 const int prev_tensor_block_idx = tensor_block_idx;
3566 if (new_anonymous_tensor_block)
3567 {
3568 if (!anonymous_block_free_list)
3569 anonymous_block_free_list = ccv_array_new(sizeof(int), 0, 0);
3570 ccv_array_push(anonymous_block_free_list, &tensor_block_size);
3571 ++tensor_block_size;
3572 }
3573 for (k = 0; k < unroll_count; k++)
3574 {
3575 const int tensor_block_idx = new_anonymous_tensor_block ?
3576 (dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k] = tensor_block_size) :
3577 dup_tensor_block_ref[prev_tensor_block_idx * unroll_count + k];
3578 TENSOR_SET_ANONYMOUS(tensor_blocks[tensor_block_idx])(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0x10) | ANONYMOUS))
;
3579 TENSOR_SET_READ_WRITE(tensor_blocks[tensor_block_idx], TENSOR_READ_WRITE(s_alloc_prep->buffers[i]))(tensor_blocks[tensor_block_idx].flags = ((tensor_blocks[tensor_block_idx
].flags & ~0xc) | (s_alloc_prep->buffers[i].flags &
0xc)))
;
3580 if (new_anonymous_tensor_block)
3581 {
3582 tensor_blocks[tensor_block_idx].type = s_alloc_prep->buffers[i].type;
3583 tensor_blocks[tensor_block_idx].pin_mem = s_alloc_prep->buffers[i].pin_mem;
3584 tensor_blocks[tensor_block_idx].size = s_alloc_prep->buffers[i].size;
3585 tensor_blocks[tensor_block_idx].head = ccv_array_new(sizeof(int), 1, 0);
3586 /* Attach to duplicated exec for this tensor block. */
3587 ccv_array_push(tensor_blocks[tensor_block_idx].head, &dup_exec_ref[idx * unroll_count + k]);
3588 } else {
3589 tensor_blocks[tensor_block_idx].pin_mem = tensor_blocks[tensor_block_idx].pin_mem || s_alloc_prep->buffers[i].pin_mem;
3590 tensor_blocks[tensor_block_idx].size = ccv_max(tensor_blocks[tensor_block_idx].size, s_alloc_prep->buffers[i].size)({ typeof (tensor_blocks[tensor_block_idx].size) _a = (tensor_blocks
[tensor_block_idx].size); typeof (s_alloc_prep->buffers[i]
.size) _b = (s_alloc_prep->buffers[i].size); (_a > _b) ?
_a : _b; })
;
3591 _ccv_nnc_tensor_block_add_exec(exec_dep, dup_exec_ref[idx * unroll_count + k], tensor_blocks[tensor_block_idx]);
3592
3593 }
3594 if (dup_p_refs && dup_p_refs->rnum > 0)
3595 {
3596 /* Not nil, not self-reflecting. */
3597 for (j = 0; j < dup_p_refs->rnum; j++)
3598 {
3599 const int dup_p_ref = *(int*)ccv_array_get(dup_p_refs, j)((void*)(((char*)((dup_p_refs)->data)) + (size_t)(dup_p_refs
)->rsize * (size_t)(j)))
;
3600 assert(dup_p_ref >= 0)((void) sizeof ((dup_p_ref >= 0) ? 1 : 0), __extension__ (
{ if (dup_p_ref >= 0) ; else __assert_fail ("dup_p_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3600, __extension__ __PRETTY_FUNCTION__
); }))
;
3601 assert(dup_p_ref < symbolic_graph->tensor_symbol_info->rnum)((void) sizeof ((dup_p_ref < symbolic_graph->tensor_symbol_info
->rnum) ? 1 : 0), __extension__ ({ if (dup_p_ref < symbolic_graph
->tensor_symbol_info->rnum) ; else __assert_fail ("dup_p_ref < symbolic_graph->tensor_symbol_info->rnum"
, "ccv_nnc_symbolic_graph_compile.c", 3601, __extension__ __PRETTY_FUNCTION__
); }))
;
3602 // If it points to a p_ref upwards, check whether this is an output, if it is an output, add it to
3603 // this block's dup_p_refs. It propagates back all the way to upper layer's buffer object.
3604 if (tensor_symbol_info[dup_p_ref].p_ref)
3605 {
3606 const int p_ref_0 = tensor_symbol_info[dup_p_ref].p_ref - 1;
3607 assert(p_node_info)((void) sizeof ((p_node_info) ? 1 : 0), __extension__ ({ if (
p_node_info) ; else __assert_fail ("p_node_info", "ccv_nnc_symbolic_graph_compile.c"
, 3607, __extension__ __PRETTY_FUNCTION__); }))
;
3608 const int p_ref_0_is_in_or_out = _ccv_nnc_is_symbolic_graph_exec_input_or_output(p_ref_0, p_node_info);
3609 if (p_ref_0_is_in_or_out == 1) // If it is out tensor, mark dup_p_ref for this.
3610 {
3611 if (!tensor_blocks[tensor_block_idx].dup_p_refs)
3612 tensor_blocks[tensor_block_idx].dup_p_refs = ccv_array_new(sizeof(int), 1, 0);
3613 ccv_array_add_unique_int(tensor_blocks[tensor_block_idx].dup_p_refs, p_ref_0);
3614 }
3615 }
3616 assert(dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref)((void) sizeof ((dup_tensor_block_ref[dup_p_ref * unroll_count
+ k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count
+ k] != dup_p_ref) ? 1 : 0), __extension__ ({ if (dup_tensor_block_ref
[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref
[dup_p_ref * unroll_count + k] != dup_p_ref) ; else __assert_fail
("dup_tensor_block_ref[dup_p_ref * unroll_count + k] >= 0 && dup_tensor_block_ref[dup_p_ref * unroll_count + k] != dup_p_ref"
, "ccv_nnc_symbolic_graph_compile.c", 3616, __extension__ __PRETTY_FUNCTION__
); }))
;
3617 const int dup_dup_p_ref = dup_tensor_block_ref[dup_p_ref * unroll_count + k];
3618 assert(tensor_blocks[dup_dup_p_ref].tail)((void) sizeof ((tensor_blocks[dup_dup_p_ref].tail) ? 1 : 0),
__extension__ ({ if (tensor_blocks[dup_dup_p_ref].tail) ; else
__assert_fail ("tensor_blocks[dup_dup_p_ref].tail", "ccv_nnc_symbolic_graph_compile.c"
, 3618, __extension__ __PRETTY_FUNCTION__); }))
;
3619 if (!tensor_blocks[tensor_block_idx].tail)
3620 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), tensor_blocks[dup_dup_p_ref].tail->rnum, 0);
3621 for (q = 0; q < tensor_blocks[dup_dup_p_ref].tail->rnum; q++)
3622 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[dup_dup_p_ref].tail, q)((void*)(((char*)((tensor_blocks[dup_dup_p_ref].tail)->data
)) + (size_t)(tensor_blocks[dup_dup_p_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3623 // We have to add it to the wrap-around companion_ref as well.
3624 if (tensor_blocks[dup_dup_p_ref].companion_ref)
3625 {
3626 const int companion_ref = tensor_blocks[dup_dup_p_ref].companion_ref - 1;
3627 for (q = 0; tensor_blocks[companion_ref].head && q < tensor_blocks[companion_ref].head->rnum; q++)
3628 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].head, q)((void*)(((char*)((tensor_blocks[companion_ref].head)->data
)) + (size_t)(tensor_blocks[companion_ref].head)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3629 for (q = 0; tensor_blocks[companion_ref].tail && q < tensor_blocks[companion_ref].tail->rnum; q++)
3630 _ccv_nnc_tensor_block_add_exec(exec_dep, *(int*)ccv_array_get(tensor_blocks[companion_ref].tail, q)((void*)(((char*)((tensor_blocks[companion_ref].tail)->data
)) + (size_t)(tensor_blocks[companion_ref].tail)->rsize * (
size_t)(q)))
, tensor_blocks[tensor_block_idx]);
3631 }
3632 }
3633 } else if (new_anonymous_tensor_block) {
3634 tensor_blocks[tensor_block_idx].tail = ccv_array_new(sizeof(int), 1, 0);
3635 ccv_array_push(tensor_blocks[tensor_block_idx].tail, &dup_exec_ref[idx * unroll_count + k]);
3636 }
3637 if (new_anonymous_tensor_block)
3638 ++tensor_block_size;
3639 }
3640 }
3641 }
3642 }
3643 }
3644 } ccv_nnc_graph_visit_endfor} }
3645 if (anonymous_block_free_list)
3646 ccv_array_free(anonymous_block_free_list);
3647 ccfreefree(tensor_fold);
3648 // It is time to guess what's the best tensor placement and create the opaque tensor arena. The alloc_dep will return
3649 // the allocation dependencies, thus, which tensor is reused to the existing tensor.
3650 ccv_nnc_tensor_alloc_prep_t* alloc_prep = _ccv_nnc_tensor_alloc_prep_new_and_free_exec_dep(exec_dep, tensor_blocks, tensor_block_size);
3651 prep->while_count_tensor = 0;
3652 prep->dup_breakpoints = 0;
3653 prep->p = 0;
3654 prep->symbolic_graph = symbolic_graph;
3655 prep->p_idx = symbolic_graph->p_idx;
3656 prep->exec_idx = symbolic_graph->exec_idx;
3657 prep->sub_prep_size = symbolic_graph->sub_graphs ? symbolic_graph->sub_graphs->rnum : 0;
3658 prep->sub_preps = sub_preps;
3659 prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3660 prep->exec_symbol_info = exec_symbol_info;
3661 prep->tensor_symbol_info_size = symbolic_graph->tensor_symbol_info->rnum;
3662 prep->tensor_symbol_info = tensor_symbol_info;
3663 prep->unroll_count = unroll_count;
3664 prep->dup_tensor_block_ref = dup_tensor_block_ref;
3665 prep->tensor_block_size = tensor_block_size;
3666 prep->tensor_blocks = tensor_blocks;
3667 prep->exec_flags = exec_flags;
3668 prep->visit = visit;
3669 prep->alloc_prep = alloc_prep;
3670 if (dup_graph)
3671 ccv_nnc_symbolic_graph_free(dup_graph);
3672 if (dup_exec_ref)
3673 ccfreefree(dup_exec_ref);
3674 return prep;
3675}
3676
/* Release a graph prep together with everything it holds.
 *
 * Frees, in order: the tensor blocks, the exec flags, every non-NULL entry of
 * sub_preps (by calling this function recursively), the sub_preps array itself
 * (when non-NULL), the tensor and exec symbol info arrays, the optional
 * dup_tensor_block_ref table, the alloc_prep, the visit object, and finally
 * the prep structure itself.
 *
 * @param prep The prep to destroy; it must not be used after this call.
 */
3677static void _ccv_nnc_symbolic_graph_prep_free(ccv_nnc_symbolic_graph_prep_t* const prep)
3678{
3679 int i;
3680 _ccv_nnc_tensor_blocks_free(prep->tensor_blocks, prep->tensor_block_size);
3681 ccfreefree(prep->exec_flags);
// Individual sub_preps entries may be NULL; only the non-NULL ones are released.
3682 for (i = 0; i < prep->sub_prep_size; i++)
3683 if (prep->sub_preps[i])
3684 _ccv_nnc_symbolic_graph_prep_free(prep->sub_preps[i]);
// The array itself is indexed above before this NULL check.
// NOTE(review): presumably sub_prep_size is 0 whenever sub_preps is NULL -- confirm.
3685 if (prep->sub_preps)
3686 ccfreefree(prep->sub_preps);
3687 ccfreefree(prep->tensor_symbol_info);
3688 ccfreefree(prep->exec_symbol_info);
// dup_tensor_block_ref is optional and only freed when present.
3689 if (prep->dup_tensor_block_ref)
3690 ccfreefree(prep->dup_tensor_block_ref);
3691 _ccv_nnc_tensor_alloc_prep_free(prep->alloc_prep);
3692 ccv_nnc_graph_visit_free(prep->visit);
3693 ccfreefree(prep);
3694}
3695
/* Mark which graph preps need a while-count tensor.
 *
 * Visits every exec node of graph_prep (in graph_prep->visit order). For a node
 * flagged CCV_NNC_GRAPH_EXEC_P_WHILE, each of its p_while.inputs that is a
 * while-count tensor symbol is decoded into a depth d; starting from the
 * node's sub-prep, the ->p link is followed d times and while_count_tensor is
 * set to 1 on the prep reached. Afterwards the function recurses into every
 * sub-graph prep the node references (graph_ref >= 0).
 *
 * @param graph_prep The prep whose exec nodes (and, recursively, sub-preps) are scanned.
 */
3696static void _ccv_nnc_symbolic_graph_prep_while_count_tensor(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
3697{
3698 int i, j;
3699 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
3700 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3701 {
// Graph refs are stored 1-based; the first ref of a while node selects its sub-prep.
3702 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3703 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3703, __extension__ __PRETTY_FUNCTION__
); }))
;
3704 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3705 for (i = 0; i < node->p_while.input_size; i++)
3706 if (CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(node->p_while.inputs[i])(((uint32_t)(node->p_while.inputs[i]) & 0xf) == 0xe))
3707 {
3708 ccv_nnc_symbolic_graph_prep_t* prep = sub_prep;
// d is the number of ->p links to follow from the sub-prep
// (presumably ->p is the parent prep -- TODO confirm; no NULL check is made while walking).
3709 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(node->p_while.inputs[i])((~(uint32_t)(node->p_while.inputs[i])) >> 4);
3710 for (j = 0; j < d; j++)
3711 prep = prep->p;
3712 prep->while_count_tensor = 1;
3713 }
3714 }
// Recurse into every sub-graph this node references, regardless of node kind.
// NOTE(review): sub_preps[graph_ref] is passed on without a NULL check -- confirm entries
// referenced here are always populated.
3715 for (i = 0; i < node->graph_ref_size; i++)
3716 {
3717 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3718 if (graph_ref >= 0)
3719 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep->sub_preps[graph_ref]);
3720 }
3721 } ccv_nnc_graph_visit_endfor} }
3722}
3723
/* Resolve a tensor symbol index to a tensor pointer using a graph prep.
 *
 * - symbol >= 0: returns graph_prep->tensor_arena->vt_tensors[symbol].
 * - symbol == CCV_NNC_NO_TENSOR_SYMBOL: returns 0.
 * - otherwise the symbol must be a while-count tensor symbol (asserted): the
 *   encoded depth d is decoded, ->p is followed d times from graph_prep, and
 *   the tensor is fetched from that prep's tensor_arena->tensor_metadata.
 *
 * @param graph_prep The prep to resolve against.
 * @param symbol     The tensor symbol index (may be one of the special negative encodings above).
 * @return The tensor pointer, or 0 for CCV_NNC_NO_TENSOR_SYMBOL.
 */
3724static ccv_nnc_tensor_t* _ccv_nnc_tensor_from_graph_prep(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const int symbol)
3725{
3726 if (symbol >= 0)
3727 return graph_prep->tensor_arena->vt_tensors[symbol];
3728 if (symbol == CCV_NNC_NO_TENSOR_SYMBOL)
3729 return 0;
3730 assert(CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol))((void) sizeof (((((uint32_t)(symbol) & 0xf) == 0xe)) ? 1
: 0), __extension__ ({ if ((((uint32_t)(symbol) & 0xf) ==
0xe)) ; else __assert_fail ("CCV_NNC_IS_WHILE_COUNT_TENSOR_SYMBOL(symbol)"
, "ccv_nnc_symbolic_graph_compile.c", 3730, __extension__ __PRETTY_FUNCTION__
); }))
;
3731 const ccv_nnc_symbolic_graph_prep_t* prep = graph_prep;
3732 int i;
// Follow ->p d times (presumably climbing to an enclosing prep -- TODO confirm); no NULL check is made.
3733 const int d = CCV_NNC_DECODE_WHILE_COUNT_SYMBOL(symbol)((~(uint32_t)(symbol)) >> 4);
3734 for (i = 0; i < d; i++)
3735 prep = prep->p;
// The prep reached must have been marked as needing a while-count tensor.
3736 assert(prep->while_count_tensor)((void) sizeof ((prep->while_count_tensor) ? 1 : 0), __extension__
({ if (prep->while_count_tensor) ; else __assert_fail ("prep->while_count_tensor"
, "ccv_nnc_symbolic_graph_compile.c", 3736, __extension__ __PRETTY_FUNCTION__
); }))
;
// NOTE(review): (0 << 1) + 1 is presumably the metadata position reserved for the while-count tensor -- confirm.
3737 return (ccv_nnc_tensor_t*)_ccv_nnc_tensor_metadata_get(prep->tensor_arena->tensor_metadata, (0 << 1) + 1);
3738}
3739
/* Topologically sort a concrete graph and re-index the exec arena to match.
 *
 * Allocates a temporary int table with one slot per entry of graph->exec_info,
 * passes it to ccv_nnc_graph_topsort, and then rewrites the arena's source.d,
 * destination.d and the .d of every graph_execs entry that belongs to `graph`
 * by looking the old index up in that table. The table is freed before return.
 *
 * @param graph            The graph to sort.
 * @param graph_exec_arena The arena whose exec indices are remapped in place.
 */
3740static void _ccv_nnc_graph_exec_arena_topsort(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
3741{
3742 int i;
// NOTE(review): the allocation result is used without a NULL check.
3743 int* const exec_cvt = (int*)ccmallocmalloc(sizeof(int) * graph->exec_info->rnum);
// exec_cvt is presumably filled with the old-index -> new-index mapping -- confirm against ccv_nnc_graph_topsort.
3744 ccv_nnc_graph_topsort(graph, exec_cvt, graph->exec_info->rnum);
3745 graph_exec_arena->source.d = exec_cvt[graph_exec_arena->source.d];
3746 graph_exec_arena->destination.d = exec_cvt[graph_exec_arena->destination.d];
3747 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
// Only execs that belong to this graph are remapped; entries for other graphs (or none) are left untouched.
3748 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
3749 if (graph_execs[i].graph == graph)
3750 graph_execs[i].d = exec_cvt[graph_execs[i].d];
3751 ccfreefree(exec_cvt);
3752}
3753
3754static ccv_nnc_graph_exec_arena_t* _ccv_nnc_graph_exec_arena_new(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_tensor_arena_t* const tensor_arena)
3755{
3756 int i, j, k;
3757 ccv_nnc_graph_t* const graph = graph_prep->graph;
3758 const int exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
3759 ccv_nnc_graph_exec_arena_t* const graph_exec_arena = (ccv_nnc_graph_exec_arena_t*)ccmallocmalloc(sizeof(ccv_nnc_graph_exec_arena_t) + sizeof(ccv_nnc_graph_exec_arena_t*) * graph_prep->sub_prep_size + sizeof(ccv_nnc_graph_exec_t) * (exec_symbol_info_size - 1));
3760 graph_exec_arena->graph_ref = (intptr_t)symbolic_graph;
3761 graph_exec_arena->graph_exec_size = exec_symbol_info_size;
3762 graph_exec_arena->sub_arena_size = graph_prep->sub_prep_size;
3763 graph_exec_arena->sub_arenas = (ccv_nnc_graph_exec_arena_t**)(graph_exec_arena->graph_execs + exec_symbol_info_size);
3764 memset(graph_exec_arena->sub_arenas, 0, sizeof(ccv_nnc_graph_exec_arena_t*) * graph_exec_arena->sub_arena_size);
3765 ccv_nnc_graph_exec_t* const graph_execs = graph_exec_arena->graph_execs;
3766 int max_input_size = 0, max_output_size = 0, max_breakpoint_size = 0;
3767 for (i = 0; i < exec_symbol_info_size; i++)
3768 {
3769 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].input_size) _b = (graph_prep->exec_symbol_info
[i].input_size); (_a > _b) ? _a : _b; })
;
3770 max_output_size = ccv_max(max_output_size, graph_prep->exec_symbol_info[i].output_size)({ typeof (max_output_size) _a = (max_output_size); typeof (graph_prep
->exec_symbol_info[i].output_size) _b = (graph_prep->exec_symbol_info
[i].output_size); (_a > _b) ? _a : _b; })
;
3771 if (graph_prep->exec_symbol_info[i].flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3772 max_input_size = ccv_max(max_input_size, graph_prep->exec_symbol_info[i].p_while.input_size)({ typeof (max_input_size) _a = (max_input_size); typeof (graph_prep
->exec_symbol_info[i].p_while.input_size) _b = (graph_prep
->exec_symbol_info[i].p_while.input_size); (_a > _b) ? _a
: _b; })
;
3773 graph_execs[i].d = CCV_NNC_NO_TENSOR_SYMBOL;
3774 graph_execs[i].graph = 0;
3775 }
3776 for (i = 0; i < graph_prep->sub_prep_size; i++)
3777 max_breakpoint_size = ccv_max(max_breakpoint_size, (*(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, i))->breakpoint_size)({ typeof (max_breakpoint_size) _a = (max_breakpoint_size); typeof
((*(ccv_nnc_symbolic_graph_t**)((void*)(((char*)((symbolic_graph
->sub_graphs)->data)) + (size_t)(symbolic_graph->sub_graphs
)->rsize * (size_t)(i))))->breakpoint_size) _b = ((*(ccv_nnc_symbolic_graph_t
**)((void*)(((char*)((symbolic_graph->sub_graphs)->data
)) + (size_t)(symbolic_graph->sub_graphs)->rsize * (size_t
)(i))))->breakpoint_size); (_a > _b) ? _a : _b; })
;
3778 ccv_nnc_tensor_t* max_inputs[ccv_max(1, max_input_size)({ typeof (1) _a = (1); typeof (max_input_size) _b = (max_input_size
); (_a > _b) ? _a : _b; })
];
3779 ccv_nnc_tensor_t* max_outputs[ccv_max(1, max_output_size)({ typeof (1) _a = (1); typeof (max_output_size) _b = (max_output_size
); (_a > _b) ? _a : _b; })
];
3780 ccv_nnc_graph_exec_t max_breakpoints[ccv_max(1, max_breakpoint_size)({ typeof (1) _a = (1); typeof (max_breakpoint_size) _b = (max_breakpoint_size
); (_a > _b) ? _a : _b; })
];
3781 const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = graph_prep->exec_symbol_info;
3782 const ccv_nnc_graph_exec_flag_t* const exec_flags = graph_prep->exec_flags;
3783 // Create node, this is in topological order.
3784 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
3785 if (CCV_NO_GRAPH_EXEC(graph_execs[idx])((graph_execs[idx]).graph == 0))
3786 {
3787 for (i = 0; i < node->input_size; i++)
3788 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(graph_prep, node->inputs[i]);
3789 for (i = 0; i < node->output_size; i++)
3790 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3791 if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
3792 {
3793 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[0] - 1;
3794 assert(graph_ref >= 0)((void) sizeof ((graph_ref >= 0) ? 1 : 0), __extension__ (
{ if (graph_ref >= 0) ; else __assert_fail ("graph_ref >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3794, __extension__ __PRETTY_FUNCTION__
); }))
;
3795 ccv_nnc_symbolic_graph_prep_t* const sub_prep = graph_prep->sub_preps[graph_ref];
3796 ccv_nnc_graph_t* const sub_graph = sub_prep->graph;
3797 graph_execs[idx] = ccv_nnc_graph_while(graph, node->cmd.cmd, sub_graph);
3798 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3799 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3800 ccv_nnc_graph_exec_set_io(graph, graph_execs[idx], max_inputs, node->input_size, max_outputs, node->output_size);
3801 for (i = 0; i < node->p_while.input_size; i++)
3802 max_inputs[i] = _ccv_nnc_tensor_from_graph_prep(sub_prep, node->p_while.inputs[i]);
3803 for (i = 0; i < sub_symbolic_graph->breakpoint_size; i++)
3804 max_breakpoints[i] = ccv_nnc_graph_exec_from_symbol(sub_arena, sub_symbolic_graph->breakpoints[i]);
3805 ccv_nnc_graph_set_while_expr(sub_graph, node->p_while.expr, node->p_while.data, max_inputs, node->p_while.input_size, max_breakpoints, sub_symbolic_graph->breakpoint_size);
3806 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3807 } else if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
3808 for (i = 0; i < node->output_size; i++)
3809 if (max_outputs[i] && max_outputs[i]->alias_ref)
3810 max_outputs[i] = (ccv_nnc_tensor_t*)max_outputs[i]->alias_ref;
3811 graph_execs[idx] = ccv_nnc_graph_case_of_new(graph, node->cmd.cmd, max_inputs + node->case_of.argument.offset, node->case_of.argument.size, max_outputs, node->output_size);
3812 // Check whether this is already covered in the inputs, if not, need to be covered in the update.
3813 for (i = 0; i < node->case_of.argument.offset; i++)
3814 {
3815 ccv_nnc_tensor_t* const update = max_inputs[i];
3816 if (!CCV_IS_TENSOR_MULTIVIEW(update)((*(int*)(update)) & CCV_TENSOR_MULTIVIEW)) // No need if it is a naked tensor.
3817 continue;
3818 int flag = 0;
3819 for (j = node->case_of.argument.offset; !flag && j < node->case_of.argument.size; j++)
3820 flag = (update == max_inputs[j]);
3821 if (!flag)
3822 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], update);
3823 }
3824 const int offset = (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO) ? 1 : 0;
3825 ccv_nnc_graph_set_case_of_expr(graph, graph_execs[idx], node->case_of.expr, node->case_of.data, offset);
3826 if (exec_flags[idx].flags & CCV_NNC_GRAPH_EXEC_ATTR_CASE_OF_NO_BYPASS_IO)
3827 {
3828 // Add another graph for data transfer.
3829 ccv_nnc_graph_t* sub_graph = ccv_nnc_graph_new();
3830 for (i = 0; i < node->output_size; i++)
3831 max_outputs[i] = node->outputs[i] >= 0 ? tensor_arena->vt_tensors[node->outputs[i]] : 0;
3832 ccv_nnc_graph_exec_t io = ccv_nnc_graph_exec_new(sub_graph, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, max_inputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
, max_outputs, ccv_min(node->input_size, node->output_size)({ typeof (node->input_size) _a = (node->input_size); typeof
(node->output_size) _b = (node->output_size); (_a <
_b) ? _a : _b; })
);
3833 ccv_nnc_graph_set_sources(sub_graph, &io, 1);
3834 ccv_nnc_graph_set_destinations(sub_graph, &io, 1);
3835 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, 0);
3836 int exec_cvt;
3837 ccv_nnc_graph_topsort(sub_graph, &exec_cvt, 1);
3838 }
3839 for (i = 0; i < node->graph_ref_size; i++)
3840 {
3841 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
3842 if (graph_ref < 0)
3843 continue;
3844 ccv_nnc_graph_t* const sub_graph = graph_prep->sub_preps[graph_ref]->graph;
3845 const ccv_nnc_symbolic_graph_t* const sub_symbolic_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(symbolic_graph->sub_graphs, graph_ref)((void*)(((char*)((symbolic_graph->sub_graphs)->data)) +
(size_t)(symbolic_graph->sub_graphs)->rsize * (size_t)
(graph_ref)))
;
3846 ccv_nnc_graph_exec_arena_t* const sub_arena = graph_exec_arena->sub_arenas[graph_ref] = _ccv_nnc_graph_exec_arena_new(sub_symbolic_graph, ccv_nnc_symbolic_graph_sources(sub_symbolic_graph), ccv_nnc_symbolic_graph_source_size(sub_symbolic_graph), ccv_nnc_symbolic_graph_destinations(sub_symbolic_graph), ccv_nnc_symbolic_graph_destination_size(sub_symbolic_graph), graph_prep->sub_preps[graph_ref], tensor_arena->sub_arenas[graph_ref]);
3847 ccv_nnc_graph_set_case_of(graph, graph_execs[idx], sub_graph, i + offset);
3848 _ccv_nnc_graph_exec_arena_topsort(sub_graph, sub_arena);
3849 }
3850 } else {
3851 graph_execs[idx] = ccv_nnc_graph_exec_new(graph, node->cmd, node->hint, max_inputs, node->input_size, max_outputs, node->output_size);
3852 }
3853 ccv_nnc_graph_exec_set_io_flags(graph, graph_execs[idx], 0, 0, 0, 0);
3854 }
3855 } ccv_nnc_graph_visit_endfor} }
3856 // Then connect them.
3857 ccv_nnc_graph_visit_for(graph_prep->visit, exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((exec_symbol_info)) const node __attribute__((unused)) = (exec_symbol_info
) + idx;
{
3858 if (node->outgoings)
3859 for (i = 0; i < node->outgoings->rnum; i++)
3860 {
3861 const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)(
node->outgoings)->rsize * (size_t)(i)))
;
3862 if (graph_execs[outgoing].graph)
3863 ccv_nnc_graph_exec_concat(graph, graph_execs[idx], graph_execs[outgoing]);
3864 }
3865 } ccv_nnc_graph_visit_endfor} }
3866 int source_exec_created = 0;
3867 const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = graph_prep->tensor_symbol_info;
3868 const ccv_nnc_tensor_block_t* const tensor_blocks = graph_prep->tensor_blocks;
3869 ccv_array_t* const* const alloc_dep = graph_prep->alloc_prep->alloc_dep;
3870 // After the graph is materialized, we need to handle the case that some of these tensors require to be initialized to zero before use.
3871 for (i = 0; i < symbolic_graph->tensor_symbol_info->rnum; i++)
3872 {
3873 if (TENSOR_REQUIRE_INIT(tensor_symbol_info[i].flags)(((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS
) || ((tensor_symbol_info[i].flags) & CCV_NNC_TENSOR_SYMBOL_INIT_ONES
))
)
3874 {
3875 int ref = i;
3876 while (tensor_symbol_info[ref].alias_ref)
3877 ref = tensor_symbol_info[ref].alias_ref - 1;
3878 while (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
&& tensor_blocks[ref].ref)
3879 ref = tensor_blocks[ref].ref - 1;
3880 // This is not computable. It could be that we marked a const tensor as init zero.
3881 if (!TENSOR_EXPECT_COMPUTABLE(tensor_blocks[ref])(!((tensor_blocks[ref].flags & 0x3) == ALIAS) && !
((tensor_blocks[ref].flags & 0x3) == UNASSIGNED))
)
3882 continue;
3883 // If this tensor is not used by any exec, we don't need to init at all. Skip.
3884 if (!tensor_blocks[ref].head || tensor_blocks[ref].head->rnum == 0)
3885 continue;
3886 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[ref];
3887 // Now, we have the original tensor, we can get the actual tensor, and construct the set command.
3888 ccv_nnc_graph_exec_t set_exec;
3889 if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS)
3890 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(0)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={0,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3891 else if (tensor_symbol_info[i].flags & CCV_NNC_TENSOR_SYMBOL_INIT_ONES)
3892 set_exec = ccv_nnc_graph_exec_new(graph, CMD_SET_FORWARD(1)ccv_nnc_cmd(CCV_NNC_SET_FORWARD, 0, (ccv_nnc_cmd_param_t){.size
={.dim={1,1,1}},.blas={.a={1,}}}, 0)
, ccv_nnc_no_hint, 0, 0, &tensor, 1);
3893 for (j = 0; j < tensor_blocks[ref].head->rnum; j++)
3894 {
3895 const int outgoing = *(int*)ccv_array_get(tensor_blocks[ref].head, j)((void*)(((char*)((tensor_blocks[ref].head)->data)) + (size_t
)(tensor_blocks[ref].head)->rsize * (size_t)(j)))
;
3896 if (outgoing >= exec_symbol_info_size)
3897 continue;
3898 assert(outgoing >= 0)((void) sizeof ((outgoing >= 0) ? 1 : 0), __extension__ ({
if (outgoing >= 0) ; else __assert_fail ("outgoing >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3898, __extension__ __PRETTY_FUNCTION__
); }))
;
3899 assert(graph_execs[outgoing].graph)((void) sizeof ((graph_execs[outgoing].graph) ? 1 : 0), __extension__
({ if (graph_execs[outgoing].graph) ; else __assert_fail ("graph_execs[outgoing].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3899, __extension__ __PRETTY_FUNCTION__
); }))
;
3900 ccv_nnc_graph_exec_concat(graph, set_exec, graph_execs[outgoing]);
3901 }
3902 int flags = 0;
3903 if (alloc_dep[ref])
3904 for (j = 0; j < alloc_dep[ref]->rnum; j++)
3905 {
3906 const int d = *(int*)ccv_array_get(alloc_dep[ref], j)((void*)(((char*)((alloc_dep[ref])->data)) + (size_t)(alloc_dep
[ref])->rsize * (size_t)(j)))
;
3907 // This is from alloc_dep, it should be computable.
3908 assert(TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d]))((void) sizeof (((!((tensor_blocks[d].flags & 0x3) == ALIAS
) && !((tensor_blocks[d].flags & 0x3) == UNASSIGNED
))) ? 1 : 0), __extension__ ({ if ((!((tensor_blocks[d].flags
& 0x3) == ALIAS) && !((tensor_blocks[d].flags &
0x3) == UNASSIGNED))) ; else __assert_fail ("TENSOR_EXPECT_COMPUTABLE(tensor_blocks[d])"
, "ccv_nnc_symbolic_graph_compile.c", 3908, __extension__ __PRETTY_FUNCTION__
); }))
;
3909 if (tensor_blocks[d].tail)
3910 for (k = 0; k < tensor_blocks[d].tail->rnum; k++)
3911 {
3912 const int incoming = *(int*)ccv_array_get(tensor_blocks[d].tail, k)((void*)(((char*)((tensor_blocks[d].tail)->data)) + (size_t
)(tensor_blocks[d].tail)->rsize * (size_t)(k)))
;
3913 if (incoming >= exec_symbol_info_size)
3914 continue;
3915 assert(incoming >= 0)((void) sizeof ((incoming >= 0) ? 1 : 0), __extension__ ({
if (incoming >= 0) ; else __assert_fail ("incoming >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 3915, __extension__ __PRETTY_FUNCTION__
); }))
;
3916 assert(graph_execs[incoming].graph)((void) sizeof ((graph_execs[incoming].graph) ? 1 : 0), __extension__
({ if (graph_execs[incoming].graph) ; else __assert_fail ("graph_execs[incoming].graph"
, "ccv_nnc_symbolic_graph_compile.c", 3916, __extension__ __PRETTY_FUNCTION__
); }))
;
3917 ccv_nnc_graph_exec_concat(graph, graph_execs[incoming], set_exec);
3918 flags = 1;
3919 }
3920 }
3921 // If cannot find a start node for this exec, we need to append it to the no-op of the start.
3922 if (!flags)
3923 {
3924 if (!source_exec_created)
3925 {
3926 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3927 source_exec_created = 1;
3928 }
3929 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, set_exec);
3930 }
3931 }
3932 }
3933 // Now go through the list of tensors to see whether we need to do explicit broadcast for these tensor multi-views
3934 // (we need that if it is not associated as inputs / outputs of any execs, this is possible if all execs associate
3935 // with its alias).
3936 assert(tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size)((void) sizeof ((tensor_arena->vt_tensor_size == graph_prep
->tensor_symbol_info_size) ? 1 : 0), __extension__ ({ if (
tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size
) ; else __assert_fail ("tensor_arena->vt_tensor_size == graph_prep->tensor_symbol_info_size"
, "ccv_nnc_symbolic_graph_compile.c", 3936, __extension__ __PRETTY_FUNCTION__
); }))
;
3937 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
3938 {
3939 ccv_nnc_tensor_t* mv = tensor_arena->vt_tensors[i];
3940 // If it is multiview tensor, inspect all its head to see whether we already associated with the node.
3941 if (mv && CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
3942 {
3943 const ccv_array_t* const head = tensor_blocks[i].head;
3944 if (head && head->rnum > 0)
3945 for (j = 0; j < head->rnum; j++)
3946 {
3947 const int idx = *(int*)ccv_array_get(head, j)((void*)(((char*)((head)->data)) + (size_t)(head)->rsize
* (size_t)(j)))
;
3948 if (idx >= exec_symbol_info_size)
3949 continue;
3950 assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if (
idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 3950, __extension__ __PRETTY_FUNCTION__); }))
;
3951 const int d = graph_execs[idx].d;
3952 ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, d)((void*)(((char*)((graph->exec_info)->data)) + (size_t)
(graph->exec_info)->rsize * (size_t)(d)))
;
3953 int flag = 0;
3954 if (exec_info->tensor_wraps_ref)
3955 {
3956 ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, exec_info->tensor_wraps_ref - 1)((void*)(((char*)((graph->tensor_wraps)->data)) + (size_t
)(graph->tensor_wraps)->rsize * (size_t)(exec_info->
tensor_wraps_ref - 1)))
;
3957 for (k = 0; k < tensor_wrap_array->size && !flag; k++)
3958 flag = (tensor_wrap_array->tensor_wraps[k] && tensor_wrap_array->tensor_wraps[k]->tensors[0] == mv);
3959 }
3960 // If none is in the flag, it need to be included in the cast.
3961 if (!flag)
3962 ccv_nnc_graph_exec_add_as_affected(graph, graph_execs[idx], mv);
3963 }
3964 }
3965 }
3966 // Create source / destination phony node. This is to facilitate use of compiled graph.
3967 // Also, this is needed if you have init zero execs.
3968 if (source_exec_created || source_size > 1)
3969 {
3970 if (!source_exec_created)
3971 graph_exec_arena->source = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3972 for (i = 0; i < source_size; i++)
3973 ccv_nnc_graph_exec_concat(graph, graph_exec_arena->source, graph_execs[sources[i].d]);
3974 } else {
3975 assert(!source_exec_created)((void) sizeof ((!source_exec_created) ? 1 : 0), __extension__
({ if (!source_exec_created) ; else __assert_fail ("!source_exec_created"
, "ccv_nnc_symbolic_graph_compile.c", 3975, __extension__ __PRETTY_FUNCTION__
); }))
;
3976 assert(source_size == 1)((void) sizeof ((source_size == 1) ? 1 : 0), __extension__ ({
if (source_size == 1) ; else __assert_fail ("source_size == 1"
, "ccv_nnc_symbolic_graph_compile.c", 3976, __extension__ __PRETTY_FUNCTION__
); }))
;
3977 graph_exec_arena->source = graph_execs[sources[0].d];
3978 }
3979 if (destination_size == 1)
3980 graph_exec_arena->destination = graph_execs[destinations[0].d];
3981 else {
3982 graph_exec_arena->destination = ccv_nnc_graph_exec_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC()((ccv_nnc_cmd_param_t){.size={.dim={1,1,1}}}), 0), ccv_nnc_no_hint, 0, 0, 0, 0);
3983 for (i = 0; i < destination_size; i++)
3984 ccv_nnc_graph_exec_concat(graph, graph_execs[destinations[i].d], graph_exec_arena->destination);
3985 }
3986 ccv_nnc_graph_set_sources(graph, &graph_exec_arena->source, 1);
3987 ccv_nnc_graph_set_destinations(graph, &graph_exec_arena->destination, 1);
3988 return graph_exec_arena;
3989}
3990
3991static ccv_nnc_graph_t* _ccv_nnc_graph_find_pair(const ccv_nnc_symbolic_graph_prep_t* const graph_prep, const ccv_nnc_symbolic_graph_t* const pair)
3992{
3993 if (graph_prep->symbolic_graph == pair)
3994 return graph_prep->graph;
3995 int i;
3996 for (i = 0; i < graph_prep->sub_prep_size; i++)
3997 if (graph_prep->sub_preps[i])
3998 {
3999 ccv_nnc_graph_t* const graph = _ccv_nnc_graph_find_pair(graph_prep->sub_preps[i], pair);
4000 if (graph)
4001 return graph;
4002 }
4003 return 0;
4004}
4005
4006static void _ccv_nnc_graph_fixup_pair(const ccv_nnc_symbolic_graph_prep_t* const root_prep, ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4007{
4008 int i;
4009 for (i = 0; i < graph_prep->sub_prep_size; i++)
4010 if (graph_prep->sub_preps[i])
4011 {
4012 if (graph_prep->sub_preps[i]->symbolic_graph->pair)
4013 graph_prep->sub_preps[i]->graph->pair = _ccv_nnc_graph_find_pair(root_prep, graph_prep->sub_preps[i]->symbolic_graph->pair);
4014 }
4015}
4016
4017static void _ccv_nnc_graph_exec_arena_fixup_pair_ref(const ccv_nnc_graph_exec_arena_t* const root_arena, const ccv_nnc_symbolic_graph_prep_t* const graph_prep, ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4018{
4019 assert(graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph)((void) sizeof ((graph_exec_arena->graph_ref == (intptr_t)
graph_prep->symbolic_graph) ? 1 : 0), __extension__ ({ if (
graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph
) ; else __assert_fail ("graph_exec_arena->graph_ref == (intptr_t)graph_prep->symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4019, __extension__ __PRETTY_FUNCTION__
); }))
;
4020 int i;
4021 for (i = 0; i < graph_prep->exec_symbol_info_size; i++)
4022 {
4023 if (CCV_NNC_GRAPH_EXEC_IS_DEAD(graph_prep->exec_symbol_info[i].flags)((graph_prep->exec_symbol_info[i].flags) & CCV_NNC_GRAPH_EXEC_DEAD
)
)
4024 continue;
4025 if (graph_exec_arena->graph_execs[i].graph && graph_prep->exec_symbol_info[i].pair_ref)
4026 {
4027 ccv_nnc_graph_exec_t pair_exec = ccv_nnc_graph_exec_from_symbol(root_arena, (ccv_nnc_graph_exec_symbol_t){
4028 .d = graph_prep->exec_symbol_info[i].pair_ref - 1,
4029 .graph = graph_prep->symbolic_graph->pair ? graph_prep->symbolic_graph->pair : graph_prep->symbolic_graph,
4030 });
4031 if (pair_exec.d >= 0)
4032 ccv_nnc_graph_exec_pair_with(graph_prep->graph, graph_exec_arena->graph_execs[i], pair_exec);
4033 }
4034 }
4035 for (i = 0; i < graph_prep->sub_prep_size; i++)
4036 if (graph_prep->sub_preps[i])
4037 _ccv_nnc_graph_exec_arena_fixup_pair_ref(root_arena, graph_prep->sub_preps[i], graph_exec_arena->sub_arenas[i]);
4038}
4039
4040static void _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(ccv_nnc_symbolic_graph_prep_t* const graph_prep)
4041{
4042 int i;
4043 if (graph_prep->dup_breakpoints)
4044 {
4045 // Strip the const modifier only possible because it is a sub-graph.
4046 ccv_nnc_symbolic_graph_t* const symbolic_graph = (ccv_nnc_symbolic_graph_t*)graph_prep->symbolic_graph;
4047 for (i = 0; i < graph_prep->dup_breakpoints->rnum; i++)
4048 ccv_nnc_graph_exec_symbol_free(symbolic_graph, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(graph_prep->dup_breakpoints, i)((void*)(((char*)((graph_prep->dup_breakpoints)->data))
+ (size_t)(graph_prep->dup_breakpoints)->rsize * (size_t
)(i)))
);
4049 ccv_array_free(graph_prep->dup_breakpoints);
4050 graph_prep->dup_breakpoints = 0;
4051 graph_prep->exec_symbol_info_size = symbolic_graph->exec_symbol_info->rnum;
4052 // Afterwards, we have to regenerate the exec_symbol_info, fill in the information (through symbol_infer).
4053 memcpy(graph_prep->exec_symbol_info, ccv_array_get(symbolic_graph->exec_symbol_info, 0)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0)))
, sizeof(ccv_nnc_graph_exec_symbol_info_t) * graph_prep->exec_symbol_info_size);
4054 // Since exec_symbol_info changed, create a new visit object.
4055 assert(symbolic_graph->sources)((void) sizeof ((symbolic_graph->sources) ? 1 : 0), __extension__
({ if (symbolic_graph->sources) ; else __assert_fail ("symbolic_graph->sources"
, "ccv_nnc_symbolic_graph_compile.c", 4055, __extension__ __PRETTY_FUNCTION__
); }))
;
4056 assert(symbolic_graph->destinations)((void) sizeof ((symbolic_graph->destinations) ? 1 : 0), __extension__
({ if (symbolic_graph->destinations) ; else __assert_fail
("symbolic_graph->destinations", "ccv_nnc_symbolic_graph_compile.c"
, 4056, __extension__ __PRETTY_FUNCTION__); }))
;
4057 ccv_nnc_graph_exec_symbol_t* const sources = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->sources, 0)((void*)(((char*)((symbolic_graph->sources)->data)) + (
size_t)(symbolic_graph->sources)->rsize * (size_t)(0)))
;
4058 const int source_size = symbolic_graph->sources->rnum;
4059 ccv_nnc_graph_exec_symbol_t* const destinations = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(symbolic_graph->destinations, 0)((void*)(((char*)((symbolic_graph->destinations)->data)
) + (size_t)(symbolic_graph->destinations)->rsize * (size_t
)(0)))
;
4060 const int destination_size = symbolic_graph->destinations->rnum;
4061 ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(symbolic_graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, 0), symbolic_graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc
(sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) *
((symbolic_graph->exec_symbol_info->rnum) - 1)); _visit_
->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t
c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_
= 0; for (_i_ = 0; _i_ < (symbolic_graph->exec_symbol_info
->rnum); _i_++) _incoming_edges_ += (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings) ? ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_i_].outgoings->rnum : 0; const int _heap_mem_
= ((symbolic_graph->exec_symbol_info->rnum) + _incoming_edges_
> 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_)
_incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t
) * (symbolic_graph->exec_symbol_info->rnum) + sizeof(int32_t
) * ((symbolic_graph->exec_symbol_info->rnum) * 2 + _incoming_edges_
)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (
sizeof(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info
->rnum) + sizeof(int32_t) * ((symbolic_graph->exec_symbol_info
->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof
(ccv_nnc_incoming_t) * (symbolic_graph->exec_symbol_info->
rnum)); int32_t* _exists_[2] = { (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)), (int32_t*)(_incomings_ + (symbolic_graph
->exec_symbol_info->rnum)) + (symbolic_graph->exec_symbol_info
->rnum), }; int32_t* const _edges_ = _exists_[1] + (symbolic_graph
->exec_symbol_info->rnum); for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _incomings_[(sources)[_i_].d].r = 1; _exists_[0][_i_]
= (sources)[_i_].d; } int _exist_size_[2] = { (source_size),
0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 1) continue; _incomings_[_idx_].r = 2
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_
[d].r = 1; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (source_size); _i_++) { ((void) sizeof ((
(sources)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((sources)[_i_].graph == symbolic_graph) ; else __assert_fail
("(sources)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4061, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(sources)[_i_].d].r = 3; _exists_[0][_i_] = (sources)[_i_].d;
} _exist_size_[0] = (source_size); _exist_size_[1] = 0; _p_ =
0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0)
{ _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_
[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (
_incomings_[_idx_].r != 3) continue; _incomings_[_idx_].r = 4
; if (((ccv_nnc_graph_exec_symbol_info_t*)((void*)(((char*)((
symbolic_graph->exec_symbol_info)->data)) + (size_t)(symbolic_graph
->exec_symbol_info)->rsize * (size_t)(0))))[_idx_].outgoings
) for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t*)
((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_
; _bump_ += _incomings_[d].c; _incomings_[d].c = 0; } _edges_
[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_
[d].c; if (_incomings_[d].r != 2) continue; _incomings_[d].r =
3; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph->
exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_
[_q_] < (symbolic_graph->exec_symbol_info->rnum)) ; else
__assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4061, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].r = 5; _exists_[0][_i_] = (destinations
)[_i_].d; } _exist_size_[0] = (destination_size); _exist_size_
[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) {
_exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_
]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_
[_idx_].r != 5) continue; _incomings_[_idx_].r = 6; if (_incomings_
[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_
].c; _j_++) { const int d = _edges_[_incomings_[_idx_].edges -
1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d
].r = 5; ((void) sizeof ((_exist_size_[_q_] < (symbolic_graph
->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if
(_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for
(_i_ = 0; _i_ < (destination_size); _i_++) { ((void) sizeof
(((destinations)[_i_].graph == symbolic_graph) ? 1 : 0), __extension__
({ if ((destinations)[_i_].graph == symbolic_graph) ; else __assert_fail
("(destinations)[_i_].graph == symbolic_graph", "ccv_nnc_symbolic_graph_compile.c"
, 4061, __extension__ __PRETTY_FUNCTION__); })); _incomings_[
(destinations)[_i_].d].d = 1; } for (_i_ = 0; _i_ < (source_size
); _i_++) { ((void) sizeof (((sources)[_i_].graph == symbolic_graph
) ? 1 : 0), __extension__ ({ if ((sources)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(sources)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[0][_i_] = (sources)[_i_].d; } _p_ = 0; _q_ =
1; _exist_size_[0] = (source_size); _exist_size_[1] = 0; int
_d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_
] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_];) { const int32_t
_idx_ = _exists_[_p_][_i_]; _visit_->node[_visit_->size
].index = ((_idx_)); _visit_->node[_visit_->size].term =
((_incomings_[_idx_].d)); ++_visit_->size;; if (_incomings_
[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings) { if (((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum == 1) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(0)))
; --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { _exists_
[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < ((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings->rnum; _j_++) { const int
d = *(int*)((void*)(((char*)((((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->data)) + (size_t)(((ccv_nnc_graph_exec_symbol_info_t
*)((void*)(((char*)((symbolic_graph->exec_symbol_info)->
data)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize
* (size_t)(0))))[_idx_].outgoings)->rsize * (size_t)(_j_)
)); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_
[d].r == 6 && _d_ < (destination_size)) { ((void) sizeof
((_exist_size_[_q_] < (symbolic_graph->exec_symbol_info
->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] <
(symbolic_graph->exec_symbol_info->rnum)) ; else __assert_fail
("_exist_size_[_q_] < (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[
_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (
_i_)); } for (_i_ = 0; _i_ < (destination_size); _i_++) { (
(void) sizeof (((destinations)[_i_].graph == symbolic_graph) ?
1 : 0), __extension__ ({ if ((destinations)[_i_].graph == symbolic_graph
) ; else __assert_fail ("(destinations)[_i_].graph == symbolic_graph"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); if (_incomings_[(destinations)[_i_].d].r == 7) continue
; if (!(0)) { ((void) sizeof ((_incomings_[(destinations)[_i_
].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[(destinations
)[_i_].d].c == 0) ; else __assert_fail ("_incomings_[(destinations)[_i_].d].c == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); } else if (_incomings_[(destinations)[_i_].d].c > 0
) continue; _visit_->node[_visit_->size].index = (((destinations
)[_i_].d)); _visit_->node[_visit_->size].term = ((_incomings_
[(destinations)[_i_].d].d)); ++_visit_->size;; } if (_heap_mem_
) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->
size <= (symbolic_graph->exec_symbol_info->rnum)) ? 1
: 0), __extension__ ({ if (_visit_->size <= (symbolic_graph
->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (symbolic_graph->exec_symbol_info->rnum)"
, "ccv_nnc_symbolic_graph_compile.c", 4061, __extension__ __PRETTY_FUNCTION__
); })); _visit_; })
;
4062 ccv_nnc_graph_visit_free(graph_prep->visit);
4063 graph_prep->visit = visit;
4064 assert(graph_prep->p)((void) sizeof ((graph_prep->p) ? 1 : 0), __extension__ ({
if (graph_prep->p) ; else __assert_fail ("graph_prep->p"
, "ccv_nnc_symbolic_graph_compile.c", 4064, __extension__ __PRETTY_FUNCTION__
); }))
;
4065 ccv_nnc_symbolic_graph_symbol_infer(symbolic_graph, visit, sources, source_size, destinations, destination_size, graph_prep->p->tensor_symbol_info, graph_prep->p->tensor_symbol_info_size, graph_prep->tensor_symbol_info, graph_prep->exec_symbol_info);
4066 }
4067 ccv_nnc_graph_visit_for(graph_prep->visit, graph_prep->exec_symbol_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (graph_prep->visit)->
size; _i_++) { const int idx __attribute__((unused)) = (graph_prep
->visit)->node[_i_].index; const int _node_unused_ __attribute__
((unused)) = (graph_prep->visit)->node[_i_].term; typeof
((graph_prep->exec_symbol_info)) const node __attribute__
((unused)) = (graph_prep->exec_symbol_info) + idx;
{
4068 for (i = 0; i < node->graph_ref_size; i++)
4069 {
4070 const int graph_ref = CCV_NNC_GRAPH_REF(node)((node)->_heap_graph_ref ? (node)->_heap_graph_ref : (node
)->_inline_graph_ref)
[i] - 1;
4071 if (graph_ref >= 0)
4072 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep->sub_preps[graph_ref]);
4073 }
4074 } ccv_nnc_graph_visit_endfor} }
4075}
4076
4077const ccv_nnc_symbolic_graph_compile_param_t ccv_nnc_default_compile_params = {};
4078
4079void ccv_nnc_symbolic_graph_compile(const ccv_nnc_symbolic_graph_t* const symbolic_graph, const ccv_nnc_symbolic_graph_compile_param_t compile_params, const ccv_nnc_tensor_bind_t* const tensor_binds, const int tensor_bind_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, ccv_nnc_graph_t** const graph_ref, ccv_nnc_tensor_arena_t** const tensor_arena_ref, ccv_nnc_graph_exec_arena_t** const graph_exec_arena_ref)
4080{
4081 assert(graph_ref)((void) sizeof ((graph_ref) ? 1 : 0), __extension__ ({ if (graph_ref
) ; else __assert_fail ("graph_ref", "ccv_nnc_symbolic_graph_compile.c"
, 4081, __extension__ __PRETTY_FUNCTION__); }))
;
4082 assert(tensor_arena_ref)((void) sizeof ((tensor_arena_ref) ? 1 : 0), __extension__ ({
if (tensor_arena_ref) ; else __assert_fail ("tensor_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4082, __extension__ __PRETTY_FUNCTION__
); }))
;
4083 assert(graph_exec_arena_ref)((void) sizeof ((graph_exec_arena_ref) ? 1 : 0), __extension__
({ if (graph_exec_arena_ref) ; else __assert_fail ("graph_exec_arena_ref"
, "ccv_nnc_symbolic_graph_compile.c", 4083, __extension__ __PRETTY_FUNCTION__
); }))
;
4084 int i;
4085 // Cannot bind the multi-view.
4086 for (i = 0; i < tensor_bind_size; i++)
4087 {
4088 assert(tensor_binds[i].tensor)((void) sizeof ((tensor_binds[i].tensor) ? 1 : 0), __extension__
({ if (tensor_binds[i].tensor) ; else __assert_fail ("tensor_binds[i].tensor"
, "ccv_nnc_symbolic_graph_compile.c", 4088, __extension__ __PRETTY_FUNCTION__
); }))
;
4089 assert(!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor))((void) sizeof ((!((*(int*)(tensor_binds[i].tensor)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (!((*(int*)(tensor_binds[i].
tensor)) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("!CCV_IS_TENSOR_MULTIVIEW(tensor_binds[i].tensor)"
, "ccv_nnc_symbolic_graph_compile.c", 4089, __extension__ __PRETTY_FUNCTION__
); }))
;
4090 }
4091 ccv_nnc_symbolic_graph_prep_t* graph_prep = _ccv_nnc_symbolic_graph_prep_new(symbolic_graph, tensor_binds, tensor_bind_size, outputs, output_size, sources, source_size, destinations, destination_size, 0, 0, 0, 0);
4092 _ccv_nnc_symbolic_graph_prep_while_count_tensor(graph_prep);
4093 ccv_nnc_tensor_arena_t* tensor_arena = _ccv_nnc_tensor_arena_new(graph_prep, compile_params.allocator, 0, tensor_binds, tensor_bind_size);
4094 _ccv_nnc_tensor_arena_fixup_pair_ref_and_tape_var(tensor_arena, graph_prep, tensor_arena);
4095 *tensor_arena_ref = tensor_arena;
4096 // The above handled tensor allocation, now we need to materialize the graph from symbolic to real.
4097 _ccv_nnc_graph_fixup_pair(graph_prep, graph_prep);
4098 // Now tensor allocation is done, if there are any dup_breakpoints, I need to clean it up.
4099 _ccv_nnc_symbolic_graph_prep_dup_breakpoints_free(graph_prep);
4100 *graph_ref = graph_prep->graph;
4101 ccv_nnc_graph_exec_arena_t* graph_exec_arena = _ccv_nnc_graph_exec_arena_new(symbolic_graph, sources, source_size, destinations, destination_size, graph_prep, tensor_arena);
4102 _ccv_nnc_graph_exec_arena_topsort(graph_prep->graph, graph_exec_arena);
4103 _ccv_nnc_graph_exec_arena_fixup_pair_ref(graph_exec_arena, graph_prep, graph_exec_arena);
4104 *graph_exec_arena_ref = graph_exec_arena;
4105 _ccv_nnc_symbolic_graph_prep_free(graph_prep);
4106}
4107
4108static void _ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4109{
4110 // Buffers are inherited from above, no need to dealloc.
4111 int i;
4112 for (i = 0; i < tensor_arena->sub_arena_size; i++)
4113 if (tensor_arena->sub_arenas[i])
4114 _ccv_nnc_tensor_arena_free(tensor_arena->sub_arenas[i]);
4115 for (i = 0; i < tensor_arena->m_tensor_idx->rnum; i++)
4116 {
4117 ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)_ccv_nnc_tensor_metadata_get(tensor_arena->tensor_metadata, *(int*)ccv_array_get(tensor_arena->m_tensor_idx, i)((void*)(((char*)((tensor_arena->m_tensor_idx)->data)) +
(size_t)(tensor_arena->m_tensor_idx)->rsize * (size_t)
(i)))
);
4118 assert(mv && CCV_IS_TENSOR_MULTIVIEW(mv))((void) sizeof ((mv && ((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW
)) ? 1 : 0), __extension__ ({ if (mv && ((*(int*)(mv)
) & CCV_TENSOR_MULTIVIEW)) ; else __assert_fail ("mv && CCV_IS_TENSOR_MULTIVIEW(mv)"
, "ccv_nnc_symbolic_graph_compile.c", 4118, __extension__ __PRETTY_FUNCTION__
); }))
;
4119 ccv_nnc_tensor_multiview_free(*mv);
4120 }
4121 ccv_array_free(tensor_arena->tensor_metadata);
4122 ccv_array_free(tensor_arena->m_tensor_idx);
4123 if (tensor_arena->pb_vt_tensors)
4124 ccfreefree(tensor_arena->pb_vt_tensors);
4125 if (tensor_arena->vt_alias_r_refs_p)
4126 ccfreefree(tensor_arena->vt_alias_r_refs_p);
4127 if (tensor_arena->vt_sizes)
4128 ccfreefree(tensor_arena->vt_sizes);
4129 ccfreefree(tensor_arena);
4130}
4131
4132void ccv_nnc_tensor_bind_symbol(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_t* const tensor)
4133{
4134 assert(tensor_arena->graph_ref == (intptr_t)symbol.graph)((void) sizeof ((tensor_arena->graph_ref == (intptr_t)symbol
.graph) ? 1 : 0), __extension__ ({ if (tensor_arena->graph_ref
== (intptr_t)symbol.graph) ; else __assert_fail ("tensor_arena->graph_ref == (intptr_t)symbol.graph"
, "ccv_nnc_symbolic_graph_compile.c", 4134, __extension__ __PRETTY_FUNCTION__
); }))
;
1
Assuming field 'graph_ref' is equal to field 'graph'
2
Taking true branch
4135 assert(symbol.d < tensor_arena->vt_tensor_size)((void) sizeof ((symbol.d < tensor_arena->vt_tensor_size
) ? 1 : 0), __extension__ ({ if (symbol.d < tensor_arena->
vt_tensor_size) ; else __assert_fail ("symbol.d < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4135, __extension__ __PRETTY_FUNCTION__
); }))
;
3
Assuming field 'd' is < field 'vt_tensor_size'
4
Taking true branch
4136 assert(symbol.d >= 0)((void) sizeof ((symbol.d >= 0) ? 1 : 0), __extension__ ({
if (symbol.d >= 0) ; else __assert_fail ("symbol.d >= 0"
, "ccv_nnc_symbolic_graph_compile.c", 4136, __extension__ __PRETTY_FUNCTION__
); }))
;
5
Assuming field 'd' is >= 0
6
Taking true branch
4137 // Only allocate this on-demand because not everyone uses this ccv_nnc_tensor_bind_symbol method.
4138 int i;
4139 if (!tensor_arena->pb_vt_tensors)
7
Assuming field 'pb_vt_tensors' is null
8
Taking true branch
4140 {
4141 tensor_arena->pb_vt_tensors = (ccv_numeric_data_t*)cccalloccalloc(tensor_arena->vt_tensor_size, sizeof(ccv_numeric_data_t));
4142 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
9
Assuming 'i' is < field 'vt_tensor_size'
10
Loop condition is true. Entering loop body
13
Assuming 'i' is >= field 'vt_tensor_size'
14
Loop condition is false. Execution continues on line 4146
4143 if (tensor_arena->vt_tensors[i])
11
Assuming pointer value is null
12
Taking false branch
4144 tensor_arena->pb_vt_tensors[i] = tensor_arena->vt_tensors[i]->data;
4145 }
4146 if (!tensor_arena->vt_alias_r_refs_p)
15
Assuming field 'vt_alias_r_refs_p' is non-null
4147 {
4148 tensor_arena->vt_alias_r_refs_p = (int*)cccalloccalloc(tensor_arena->vt_tensor_size * 2, sizeof(int));
4149 tensor_arena->vt_alias_r_refs = tensor_arena->vt_alias_r_refs_p + tensor_arena->vt_tensor_size;
4150 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4151 if (tensor_arena->vt_alias_refs[i])
4152 {
4153 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4154 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4154, __extension__ __PRETTY_FUNCTION__
); }))
;
4155 ++tensor_arena->vt_alias_r_refs_p[alias_ref]; // Count how many alias there are.
4156 }
4157 int refp = 0;
4158 for (i = 0; i < tensor_arena->vt_tensor_size; i++) // Allocate each with aliases position on vt_alias_r_refs. It points to the end.
4159 if (tensor_arena->vt_alias_r_refs_p[i])
4160 refp = (tensor_arena->vt_alias_r_refs_p[i] += refp);
4161 else
4162 tensor_arena->vt_alias_r_refs_p[i] = -1; // This has no refs.
4163 for (i = refp; i < tensor_arena->vt_tensor_size; i++)
4164 tensor_arena->vt_alias_r_refs[i] = -1; // These are not allocated.
4165 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4166 if (tensor_arena->vt_alias_refs[i])
4167 {
4168 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4169 assert(alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size)((void) sizeof ((alias_ref >= 0 && alias_ref < tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (alias_ref
>= 0 && alias_ref < tensor_arena->vt_tensor_size
) ; else __assert_fail ("alias_ref >= 0 && alias_ref < tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4169, __extension__ __PRETTY_FUNCTION__
); }))
;
4170 const int pos = --tensor_arena->vt_alias_r_refs_p[alias_ref];
4171 assert(pos >= 0)((void) sizeof ((pos >= 0) ? 1 : 0), __extension__ ({ if (
pos >= 0) ; else __assert_fail ("pos >= 0", "ccv_nnc_symbolic_graph_compile.c"
, 4171, __extension__ __PRETTY_FUNCTION__); }))
;
4172 tensor_arena->vt_alias_r_refs[pos] = i;
4173 }
4174 }
4175 const int symbol_d = tensor_arena->vt_alias_refs[symbol.d] ? tensor_arena->vt_alias_refs[symbol.d] - 1 : symbol.d;
16
Taking false branch
17
Assuming the condition is true
18
'?' condition is true
4176 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
19
Assuming the condition is false
20
Taking false branch
4177 {
4178 assert(((ccv_nnc_tensor_view_t*)tensor)->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor)->off == 0
) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t*)tensor
)->off == 0) ; else __assert_fail ("((ccv_nnc_tensor_view_t*)tensor)->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4178, __extension__ __PRETTY_FUNCTION__
); }))
; // I cannot handle off > 0 at the moment, it is possible, but requires additional verifications.
4179 assert((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 &&((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4181, __extension__ __PRETTY_FUNCTION__
); }))
4180 ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) ||((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4181, __extension__ __PRETTY_FUNCTION__
); }))
4181 (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof (((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t
*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor
->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors
[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if ((ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor
)->stride) == 0 && ccv_nnc_tensor_count(tensor->
info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[
symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor
)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("(ccv_nnc_dimension_count(((ccv_nnc_tensor_view_t*)tensor)->stride) == 0 && ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)) || (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4181, __extension__ __PRETTY_FUNCTION__
); }))
;
4182 } else
4183 { assert(ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info))((void) sizeof ((ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ? 1 : 0), __extension__
({ if (ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count
(tensor_arena->vt_tensors[symbol_d]->info)) ; else __assert_fail
("ccv_nnc_tensor_count(tensor->info) >= ccv_nnc_tensor_count(tensor_arena->vt_tensors[symbol_d]->info)"
, "ccv_nnc_symbolic_graph_compile.c", 4183, __extension__ __PRETTY_FUNCTION__
); }))
; }
21
Assuming the condition is true
22
Taking true branch
4184 if (CCV_IS_TENSOR_VIEW(tensor_arena->vt_tensors[symbol.d])((*(int*)(tensor_arena->vt_tensors[symbol.d])) & CCV_TENSOR_VIEW
)
)
23
Dereference of null pointer
4185 { assert(((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0)((void) sizeof ((((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors
[symbol.d])->off == 0) ? 1 : 0), __extension__ ({ if (((ccv_nnc_tensor_view_t
*)tensor_arena->vt_tensors[symbol.d])->off == 0) ; else
__assert_fail ("((ccv_nnc_tensor_view_t*)tensor_arena->vt_tensors[symbol.d])->off == 0"
, "ccv_nnc_symbolic_graph_compile.c", 4185, __extension__ __PRETTY_FUNCTION__
); }))
; }
4186 tensor_arena->vt_tensors[symbol_d]->data = tensor->data;
4187 if (tensor_arena->vt_alias_r_refs_p[symbol_d] >= 0)
4188 for (i = tensor_arena->vt_alias_r_refs_p[symbol_d]; i < tensor_arena->vt_tensor_size; i++)
4189 {
4190 const int d = tensor_arena->vt_alias_r_refs[i];
4191 if (d < 0 || symbol_d + 1 != tensor_arena->vt_alias_refs[d]) // Doesn't match, reached the end of it.
4192 break;
4193 ccv_nnc_tensor_t* const d_tensor = tensor_arena->vt_tensors[d];
4194 d_tensor->info.datatype = tensor->info.datatype;
4195 d_tensor->info.reserved = tensor->info.reserved;
4196 if (CCV_IS_TENSOR_VIEW(d_tensor)((*(int*)(d_tensor)) & CCV_TENSOR_VIEW))
4197 ccv_nnc_tensor_data(tensor->info, tensor->data.u8, ((ccv_nnc_tensor_view_t*)d_tensor)->off + tensor->dataof, &d_tensor->data, &d_tensor->dataof);
4198 else {
4199 d_tensor->data.u8 = tensor->data.u8;
4200 d_tensor->dataof = tensor->dataof;
4201 }
4202 }
4203}
4204
4205void ccv_nnc_tensor_arena_clear_bindings(ccv_nnc_tensor_arena_t* const tensor_arena)
4206{
4207 if (!tensor_arena->pb_vt_tensors)
4208 return;
4209 int i;
4210 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4211 if (tensor_arena->vt_tensors[i])
4212 tensor_arena->vt_tensors[i]->data = tensor_arena->pb_vt_tensors[i];
4213}
4214
4215uint64_t ccv_nnc_tensor_arena_size(const ccv_nnc_tensor_arena_t* const tensor_arena)
4216{
4217 uint64_t total_size = 0;
4218 int i;
4219 for (i = 0; i < tensor_arena->buffer_size; i++)
4220 total_size += tensor_arena->buffers[i].size;
4221 return total_size;
4222}
4223
4224static void _ccv_nnc_multiview_update_params(ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_param_t params)
4225{
4226 int i;
4227 if (mv->it)
4228 mv->it->info = params;
4229 for (i = 0; i < mv->repeat + mv->kind; i++)
4230 {
4231 ccv_nnc_tensor_t* tensor = CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[i];
4232 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4233 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, params);
4234 else
4235 tensor->info = params;
4236 }
4237}
4238
4239int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph)
4240{
4241 int i;
4242 assert(graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size)((void) sizeof ((graph->tensor_symbol_info->rnum >= tensor_arena
->vt_tensor_size) ? 1 : 0), __extension__ ({ if (graph->
tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size
) ; else __assert_fail ("graph->tensor_symbol_info->rnum >= tensor_arena->vt_tensor_size"
, "ccv_nnc_symbolic_graph_compile.c", 4242, __extension__ __PRETTY_FUNCTION__
); }))
;
4243 if (!tensor_arena->vt_sizes) // Keep the original size so we can check against to see if we will overflow.
4244 {
4245 tensor_arena->vt_sizes = (size_t*)ccmallocmalloc(sizeof(size_t) * tensor_arena->vt_tensor_size);
4246 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4247 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4248 {
4249 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4250 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4251 {
4252 ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
4253 while (CCV_IS_TENSOR_MULTIVIEW(mv)((*(int*)(mv)) & CCV_TENSOR_MULTIVIEW))
4254 mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)((mv)->_heap_data ? (mv)->_heap_data : (mv)->_inline_data
)
[0]);
4255 tensor = (ccv_nnc_tensor_t*)mv;
4256 }
4257 tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
4258 }
4259 }
4260 int flag = 0;
4261 for (i = 0; !flag && i < tensor_arena->vt_tensor_size; i++)
4262 if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
4263 {
4264 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
(size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)))
;
4265 ccv_nnc_tensor_param_t params = symbol_info->info;
4266 params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
4267 params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
4268 flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
4269 }
4270 if (flag)
4271 return -1;
4272 for (i = 0; i < tensor_arena->vt_tensor_size; i++)
4273 if (tensor_arena->vt_tensors[i])
4274 {
4275 ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i)((void*)(((char*)((graph->tensor_symbol_info)->data)) +
(size_t)(graph->tensor_symbol_info)->rsize * (size_t)(
i)))
;
4276 ccv_nnc_tensor_t* tensor = tensor_arena->vt_tensors[i];
4277 if (CCV_IS_TENSOR_MULTIVIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_MULTIVIEW))
4278 {
4279 assert(!tensor_arena->vt_alias_refs[i])((void) sizeof ((!tensor_arena->vt_alias_refs[i]) ? 1 : 0)
, __extension__ ({ if (!tensor_arena->vt_alias_refs[i]) ; else
__assert_fail ("!tensor_arena->vt_alias_refs[i]", "ccv_nnc_symbolic_graph_compile.c"
, 4279, __extension__ __PRETTY_FUNCTION__); }))
;
4280 _ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
4281 } else if (!tensor_arena->vt_alias_refs[i]) {
4282 ccv_nnc_tensor_param_t params = symbol_info->info;
4283 params.datatype = tensor->info.datatype;
4284 params.reserved = tensor->info.reserved;
4285 tensor->info = params;
4286 } else {
4287 off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
4288 ccv_nnc_tensor_param_t params = symbol_info->info;
4289 params.datatype = tensor->info.datatype;
4290 params.reserved = tensor->info.reserved;
4291 tensor->info = params;
4292 const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
4293 ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
4294 if (CCV_IS_TENSOR_VIEW(tensor)((*(int*)(tensor)) & CCV_TENSOR_VIEW))
4295 {
4296 ((ccv_nnc_tensor_view_t*)tensor)->off = off;
4297 memcpy(((ccv_nnc_tensor_view_t*)tensor)->stride, symbol_info->stride, sizeof(((ccv_nnc_tensor_view_t*)tensor)->stride));
4298 }
4299 }
4300 }
4301 // Should handle sub_tensor_arena, don't do that at the moment.
4302 assert(!graph->sub_graphs)((void) sizeof ((!graph->sub_graphs) ? 1 : 0), __extension__
({ if (!graph->sub_graphs) ; else __assert_fail ("!graph->sub_graphs"
, "ccv_nnc_symbolic_graph_compile.c", 4302, __extension__ __PRETTY_FUNCTION__
); }))
;
4303 return 0;
4304}
4305
4306void ccv_nnc_graph_exec_reinit(ccv_nnc_graph_exec_arena_t* const graph_exec_arena, ccv_nnc_graph_t* const graph, const ccv_nnc_symbolic_graph_t* const symbolic_graph)
4307{
4308 assert(symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size)((void) sizeof ((symbolic_graph->exec_symbol_info->rnum
>= graph_exec_arena->graph_exec_size) ? 1 : 0), __extension__
({ if (symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena
->graph_exec_size) ; else __assert_fail ("symbolic_graph->exec_symbol_info->rnum >= graph_exec_arena->graph_exec_size"
, "ccv_nnc_symbolic_graph_compile.c", 4308, __extension__ __PRETTY_FUNCTION__
); }))
;
4309 int i;
4310 for (i = 0; i < graph_exec_arena->graph_exec_size; i++)
4311 {
4312 const ccv_nnc_graph_exec_t graph_exec = graph_exec_arena->graph_execs[i];
4313 if (graph_exec.d < 0)
4314 continue;
4315 const ccv_nnc_cmd_t existing_cmd = ccv_nnc_graph_exec_cmd(graph, graph_exec);
4316 const ccv_nnc_graph_exec_symbol_info_t* const symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(symbolic_graph->exec_symbol_info, i)((void*)(((char*)((symbolic_graph->exec_symbol_info)->data
)) + (size_t)(symbolic_graph->exec_symbol_info)->rsize *
(size_t)(i)))
;
4317 ccv_nnc_cmd_t new_cmd = symbol_info->cmd;
4318 if (new_cmd.cmd == existing_cmd.cmd) // If the command matches, replacing the backend and algorithm to the existing one, which hypothetically has been autotuned..
4319 {
4320 new_cmd.backend = existing_cmd.backend;
4321 new_cmd.algorithm = existing_cmd.algorithm;
4322 }
4323 ccv_nnc_graph_exec_set(graph, graph_exec, new_cmd);
4324 }
4325}
4326
4327void ccv_nnc_tensor_arena_buffer_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4328{
4329 int i;
4330 for (i = 0; i < tensor_arena->buffer_size; i++)
4331 {
4332 if (!tensor_arena->buffers[i].ptr)
4333 continue;
4334 const int buffer_type = tensor_arena->buffers[i].type;;
4335 const int memory_type = CCV_TENSOR_GET_MEMORY(buffer_type)((buffer_type) & 0x3);
4336#ifdef HAVE_CUDA1
4337 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4338 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4339 {
4340 if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4341 tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4342 else
4343 cufree(device_id, tensor_arena->buffers[i].ptr);
4344 } else {
4345 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4345, __extension__ __PRETTY_FUNCTION__
); }))
;
4346 if (tensor_arena->buffers[i].pin_mem)
4347 cuhostfree(tensor_arena->buffers[i].ptr);
4348 else
4349 ccfreefree(tensor_arena->buffers[i].ptr);
4350 }
4351#elif defined(HAVE_MPS)
4352 const int device_id = CCV_TENSOR_GET_DEVICE_ID(buffer_type)(((buffer_type) & 0xfff00) >> 8);
4353 if (memory_type == CCV_TENSOR_GPU_MEMORY)
4354 {
4355 // if (tensor_arena->allocator.isa && tensor_arena->allocator.isa->free)
4356 // tensor_arena->allocator.isa->free(tensor_arena->buffers[i].ptr, tensor_arena->allocator.context.free);
4357 // else
4358 mpheapfree(device_id, tensor_arena->buffers[i].ptr);
4359 } else {
4360 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4360, __extension__ __PRETTY_FUNCTION__
); }))
;
4361 ccfreefree(tensor_arena->buffers[i].ptr);
4362 }
4363#else
4364 assert(memory_type == CCV_TENSOR_CPU_MEMORY)((void) sizeof ((memory_type == CCV_TENSOR_CPU_MEMORY) ? 1 : 0
), __extension__ ({ if (memory_type == CCV_TENSOR_CPU_MEMORY)
; else __assert_fail ("memory_type == CCV_TENSOR_CPU_MEMORY"
, "ccv_nnc_symbolic_graph_compile.c", 4364, __extension__ __PRETTY_FUNCTION__
); }))
;
4365 ccfreefree(tensor_arena->buffers[i].ptr);
4366#endif
4367 tensor_arena->buffers[i].ptr = 0;
4368 }
4369 // For now, the life-cycle of the disposers lives with the buffer. It may ends before the tensor arena deallocates.
4370 if (tensor_arena->disposers)
4371 {
4372 for (i = 0; i < tensor_arena->disposers->rnum; i++)
4373 {
4374 ccv_nnc_arena_disposer_t* const disposer = (ccv_nnc_arena_disposer_t*)ccv_array_get(tensor_arena->disposers, i)((void*)(((char*)((tensor_arena->disposers)->data)) + (
size_t)(tensor_arena->disposers)->rsize * (size_t)(i)))
;
4375 disposer->dispose(disposer->ptr, disposer->userdata);
4376 }
4377 ccv_array_free(tensor_arena->disposers);
4378 tensor_arena->disposers = 0;
4379 }
4380}
4381
4382void ccv_nnc_tensor_arena_free(ccv_nnc_tensor_arena_t* const tensor_arena)
4383{
4384 ccv_nnc_tensor_arena_buffer_free(tensor_arena);
4385 _ccv_nnc_tensor_arena_free(tensor_arena);
4386}
4387
4388void ccv_nnc_graph_exec_arena_free(ccv_nnc_graph_exec_arena_t* const graph_exec_arena)
4389{
4390 int i;
4391 for (i = 0; i < graph_exec_arena->sub_arena_size; i++)
4392 if (graph_exec_arena->sub_arenas[i])
4393 ccv_nnc_graph_exec_arena_free(graph_exec_arena->sub_arenas[i]);
4394 ccfreefree(graph_exec_arena);
4395}